diff options
author | Sergei Golubchik <vuvova@gmail.com> | 2015-05-04 19:17:21 +0200 |
---|---|---|
committer | Sergei Golubchik <vuvova@gmail.com> | 2015-05-04 19:17:21 +0200 |
commit | 6d06fbbd1dc25b3c12568f9038060dfdb69f9683 (patch) | |
tree | 21e27f3fddc89f9dda6b337091464ba10c490123 /storage/innobase/include | |
parent | 1645930d0bd02f79df3ebff412b90acdc15bd9a0 (diff) | |
download | mariadb-git-6d06fbbd1dc25b3c12568f9038060dfdb69f9683.tar.gz |
move to storage/innobase
Diffstat (limited to 'storage/innobase/include')
227 files changed, 72797 insertions, 0 deletions
diff --git a/storage/innobase/include/api0api.h b/storage/innobase/include/api0api.h new file mode 100644 index 00000000000..d77d691becc --- /dev/null +++ b/storage/innobase/include/api0api.h @@ -0,0 +1,1304 @@ +/***************************************************************************** + +Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/api0api.h +InnoDB Native API + +2008-08-01 Created by Sunny Bains. +3/20/2011 Jimmy Yang extracted from Embedded InnoDB +*******************************************************/ + +#ifndef api0api_h +#define api0api_h + +#include "db0err.h" +#include <stdio.h> + +#ifdef _MSC_VER +#define strncasecmp _strnicmp +#define strcasecmp _stricmp +#endif + +#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER) +#define UNIV_NO_IGNORE __attribute__ ((warn_unused_result)) +#else +#define UNIV_NO_IGNORE +#endif /* __GNUC__ && __GNUC__ > 2 && !__INTEL_COMPILER */ + +/* See comment about ib_bool_t as to why the two macros are unsigned long. */ +/** The boolean value of "true" used internally within InnoDB */ +#define IB_TRUE 0x1UL +/** The boolean value of "false" used internally within InnoDB */ +#define IB_FALSE 0x0UL + +/* Basic types used by the InnoDB API. */ +/** All InnoDB error codes are represented by ib_err_t */ +typedef enum dberr_t ib_err_t; +/** Representation of a byte within InnoDB */ +typedef unsigned char ib_byte_t; +/** Representation of an unsigned long int within InnoDB */ +typedef unsigned long int ib_ulint_t; + +/* We assume C99 support except when using VisualStudio. */ +#if !defined(_MSC_VER) +#include <stdint.h> +#endif /* _MSC_VER */ + +/* Integer types used by the API. Microsft VS defines its own types +and we use the Microsoft types when building with Visual Studio. */ +#if defined(_MSC_VER) +/** A signed 8 bit integral type. */ +typedef __int8 ib_i8_t; +#else +/** A signed 8 bit integral type. */ +typedef int8_t ib_i8_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 8 bit integral type. */ +typedef unsigned __int8 ib_u8_t; +#else +/** An unsigned 8 bit integral type. */ +typedef uint8_t ib_u8_t; +#endif + +#if defined(_MSC_VER) +/** A signed 16 bit integral type. */ +typedef __int16 ib_i16_t; +#else +/** A signed 16 bit integral type. */ +typedef int16_t ib_i16_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 16 bit integral type. */ +typedef unsigned __int16 ib_u16_t; +#else +/** An unsigned 16 bit integral type. */ +typedef uint16_t ib_u16_t; +#endif + +#if defined(_MSC_VER) +/** A signed 32 bit integral type. */ +typedef __int32 ib_i32_t; +#else +/** A signed 32 bit integral type. */ +typedef int32_t ib_i32_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 32 bit integral type. */ +typedef unsigned __int32 ib_u32_t; +#else +/** An unsigned 32 bit integral type. */ +typedef uint32_t ib_u32_t; +#endif + +#if defined(_MSC_VER) +/** A signed 64 bit integral type. */ +typedef __int64 ib_i64_t; +#else +/** A signed 64 bit integral type. */ +typedef int64_t ib_i64_t; +#endif + +#if defined(_MSC_VER) +/** An unsigned 64 bit integral type. */ +typedef unsigned __int64 ib_u64_t; +#else +/** An unsigned 64 bit integral type. */ +typedef uint64_t ib_u64_t; +#endif + +typedef void* ib_opaque_t; +typedef ib_opaque_t ib_charset_t; +typedef ib_ulint_t ib_bool_t; +typedef ib_u64_t ib_id_u64_t; + +/** @enum ib_cfg_type_t Possible types for a configuration variable. */ +typedef enum { + IB_CFG_IBOOL, /*!< The configuration parameter is + of type ibool */ + + /* XXX Can we avoid having different types for ulint and ulong? + - On Win64 "unsigned long" is 32 bits + - ulong is always defined as "unsigned long" + - On Win64 ulint is defined as 64 bit integer + => On Win64 ulint != ulong. + If we typecast all ulong and ulint variables to the smaller type + ulong, then we will cut the range of the ulint variables. + This is not a problem for most ulint variables because their max + allowed values do not exceed 2^32-1 (e.g. log_groups is ulint + but its max allowed value is 10). BUT buffer_pool_size and + log_file_size allow up to 2^64-1. */ + + IB_CFG_ULINT, /*!< The configuration parameter is + of type ulint */ + + IB_CFG_ULONG, /*!< The configuration parameter is + of type ulong */ + + IB_CFG_TEXT, /*!< The configuration parameter is + of type char* */ + + IB_CFG_CB /*!< The configuration parameter is + a callback parameter */ +} ib_cfg_type_t; + +/** @enum ib_col_type_t column types that are supported. */ +typedef enum { + IB_VARCHAR = 1, /*!< Character varying length. The + column is not padded. */ + + IB_CHAR = 2, /*!< Fixed length character string. The + column is padded to the right. */ + + IB_BINARY = 3, /*!< Fixed length binary, similar to + IB_CHAR but the column is not padded + to the right. */ + + IB_VARBINARY = 4, /*!< Variable length binary */ + + IB_BLOB = 5, /*!< Binary large object, or + a TEXT type */ + + IB_INT = 6, /*!< Integer: can be any size + from 1 - 8 bytes. If the size is + 1, 2, 4 and 8 bytes then you can use + the typed read and write functions. For + other sizes you will need to use the + ib_col_get_value() function and do the + conversion yourself. */ + + IB_SYS = 8, /*!< System column, this column can + be one of DATA_TRX_ID, DATA_ROLL_PTR + or DATA_ROW_ID. */ + + IB_FLOAT = 9, /*!< C (float) floating point value. */ + + IB_DOUBLE = 10, /*!> C (double) floating point value. */ + + IB_DECIMAL = 11, /*!< Decimal stored as an ASCII + string */ + + IB_VARCHAR_ANYCHARSET = 12, /*!< Any charset, varying length */ + + IB_CHAR_ANYCHARSET = 13 /*!< Any charset, fixed length */ + +} ib_col_type_t; + +/** @enum ib_tbl_fmt_t InnoDB table format types */ +typedef enum { + IB_TBL_REDUNDANT, /*!< Redundant row format, the column + type and length is stored in the row.*/ + + IB_TBL_COMPACT, /*!< Compact row format, the column + type is not stored in the row. The + length is stored in the row but the + storage format uses a compact format + to store the length of the column data + and record data storage format also + uses less storage. */ + + IB_TBL_DYNAMIC, /*!< Compact row format. BLOB prefixes + are not stored in the clustered index */ + + IB_TBL_COMPRESSED /*!< Similar to dynamic format but + with pages compressed */ +} ib_tbl_fmt_t; + +/** @enum ib_col_attr_t InnoDB column attributes */ +typedef enum { + IB_COL_NONE = 0, /*!< No special attributes. */ + + IB_COL_NOT_NULL = 1, /*!< Column data can't be NULL. */ + + IB_COL_UNSIGNED = 2, /*!< Column is IB_INT and unsigned. */ + + IB_COL_NOT_USED = 4, /*!< Future use, reserved. */ + + IB_COL_CUSTOM1 = 8, /*!< Custom precision type, this is + a bit that is ignored by InnoDB and so + can be set and queried by users. */ + + IB_COL_CUSTOM2 = 16, /*!< Custom precision type, this is + a bit that is ignored by InnoDB and so + can be set and queried by users. */ + + IB_COL_CUSTOM3 = 32 /*!< Custom precision type, this is + a bit that is ignored by InnoDB and so + can be set and queried by users. */ +} ib_col_attr_t; + +/* Note: must match lock0types.h */ +/** @enum ib_lck_mode_t InnoDB lock modes. */ +typedef enum { + IB_LOCK_IS = 0, /*!< Intention shared, an intention + lock should be used to lock tables */ + + IB_LOCK_IX, /*!< Intention exclusive, an intention + lock should be used to lock tables */ + + IB_LOCK_S, /*!< Shared locks should be used to + lock rows */ + + IB_LOCK_X, /*!< Exclusive locks should be used to + lock rows*/ + + IB_LOCK_TABLE_X, /*!< exclusive table lock */ + + IB_LOCK_NONE, /*!< This is used internally to note + consistent read */ + + IB_LOCK_NUM = IB_LOCK_NONE /*!< number of lock modes */ +} ib_lck_mode_t; + +typedef enum { + IB_CLUSTERED = 1, /*!< clustered index */ + IB_UNIQUE = 2 /*!< unique index */ +} ib_index_type_t; + +/** @enum ib_srch_mode_t InnoDB cursor search modes for ib_cursor_moveto(). +Note: Values must match those found in page0cur.h */ +typedef enum { + IB_CUR_G = 1, /*!< If search key is not found then + position the cursor on the row that + is greater than the search key */ + + IB_CUR_GE = 2, /*!< If the search key not found then + position the cursor on the row that + is greater than or equal to the search + key */ + + IB_CUR_L = 3, /*!< If search key is not found then + position the cursor on the row that + is less than the search key */ + + IB_CUR_LE = 4 /*!< If search key is not found then + position the cursor on the row that + is less than or equal to the search + key */ +} ib_srch_mode_t; + +/** @enum ib_match_mode_t Various match modes used by ib_cursor_moveto() */ +typedef enum { + IB_CLOSEST_MATCH, /*!< Closest match possible */ + + IB_EXACT_MATCH, /*!< Search using a complete key + value */ + + IB_EXACT_PREFIX /*!< Search using a key prefix which + must match to rows: the prefix may + contain an incomplete field (the + last field in prefix may be just + a prefix of a fixed length column) */ +} ib_match_mode_t; + +/** @struct ib_col_meta_t InnoDB column meta data. */ +typedef struct { + ib_col_type_t type; /*!< Type of the column */ + + ib_col_attr_t attr; /*!< Column attributes */ + + ib_u32_t type_len; /*!< Length of type */ + + ib_u16_t client_type; /*!< 16 bits of data relevant only to + the client. InnoDB doesn't care */ + + ib_charset_t* charset; /*!< Column charset */ +} ib_col_meta_t; + +/* Note: Must be in sync with trx0trx.h */ +/** @enum ib_trx_state_t The transaction state can be queried using the +ib_trx_state() function. The InnoDB deadlock monitor can roll back a +transaction and users should be prepared for this, especially where there +is high contention. The way to determine the state of the transaction is to +query it's state and check. */ +typedef enum { + IB_TRX_NOT_STARTED, /*!< Has not started yet, the + transaction has not ben started yet.*/ + + IB_TRX_ACTIVE, /*!< The transaction is currently + active and needs to be either + committed or rolled back. */ + + IB_TRX_COMMITTED_IN_MEMORY, /*!< Not committed to disk yet */ + + IB_TRX_PREPARED /*!< Support for 2PC/XA */ +} ib_trx_state_t; + +/* Note: Must be in sync with trx0trx.h */ +/** @enum ib_trx_level_t Transaction isolation levels */ +typedef enum { + IB_TRX_READ_UNCOMMITTED = 0, /*!< Dirty read: non-locking SELECTs are + performed so that we do not look at a + possible earlier version of a record; + thus they are not 'consistent' reads + under this isolation level; otherwise + like level 2 */ + + IB_TRX_READ_COMMITTED = 1, /*!< Somewhat Oracle-like isolation, + except that in range UPDATE and DELETE + we must block phantom rows with + next-key locks; SELECT ... FOR UPDATE + and ... LOCK IN SHARE MODE only lock + the index records, NOT the gaps before + them, and thus allow free inserting; + each consistent read reads its own + snapshot */ + + IB_TRX_REPEATABLE_READ = 2, /*!< All consistent reads in the same + trx read the same snapshot; full + next-key locking used in locking reads + to block insertions into gaps */ + + IB_TRX_SERIALIZABLE = 3 /*!< All plain SELECTs are converted to + LOCK IN SHARE MODE reads */ +} ib_trx_level_t; + +/** Generical InnoDB callback prototype. */ +typedef void (*ib_cb_t)(void); + +#define IB_CFG_BINLOG_ENABLED 0x1 +#define IB_CFG_MDL_ENABLED 0x2 +#define IB_CFG_DISABLE_ROWLOCK 0x4 + +/** The first argument to the InnoDB message logging function. By default +it's set to stderr. You should treat ib_msg_stream_t as a void*, since +it will probably change in the future. */ +typedef FILE* ib_msg_stream_t; + +/** All log messages are written to this function.It should have the same +behavior as fprintf(3). */ +typedef int (*ib_msg_log_t)(ib_msg_stream_t, const char*, ...); + +/* Note: This is to make it easy for API users to have type +checking for arguments to our functions. Making it ib_opaque_t +by itself will result in pointer decay resulting in subverting +of the compiler's type checking. */ + +/** InnoDB tuple handle. This handle can refer to either a cluster index +tuple or a secondary index tuple. There are two types of tuples for each +type of index, making a total of four types of tuple handles. There +is a tuple for reading the entire row contents and another for searching +on the index key. */ +typedef struct ib_tuple_t* ib_tpl_t; + +/** InnoDB transaction handle, all database operations need to be covered +by transactions. This handle represents a transaction. The handle can be +created with ib_trx_begin(), you commit your changes with ib_trx_commit() +and undo your changes using ib_trx_rollback(). If the InnoDB deadlock +monitor rolls back the transaction then you need to free the transaction +using the function ib_trx_release(). You can query the state of an InnoDB +transaction by calling ib_trx_state(). */ +typedef struct trx_t* ib_trx_t; + +/** InnoDB cursor handle */ +typedef struct ib_cursor_t* ib_crsr_t; + +/*************************************************************//** +This function is used to compare two data fields for which the data type +is such that we must use the client code to compare them. + +@param col_meta column meta data +@param p1 key +@oaram p1_len key length +@param p2 second key +@param p2_len second key length +@return 1, 0, -1, if a is greater, equal, less than b, respectively */ + +typedef int (*ib_client_cmp_t)( + const ib_col_meta_t* col_meta, + const ib_byte_t* p1, + ib_ulint_t p1_len, + const ib_byte_t* p2, + ib_ulint_t p2_len); + +/* This should be the same as univ.i */ +/** Represents SQL_NULL length */ +#define IB_SQL_NULL 0xFFFFFFFF +/** The number of system columns in a row. */ +#define IB_N_SYS_COLS 3 + +/** The maximum length of a text column. */ +#define MAX_TEXT_LEN 4096 + +/* MySQL uses 3 byte UTF-8 encoding. */ +/** The maximum length of a column name in a table schema. */ +#define IB_MAX_COL_NAME_LEN (64 * 3) + +/** The maximum length of a table name (plus database name). */ +#define IB_MAX_TABLE_NAME_LEN (64 * 3) * 2 + +/*****************************************************************//** +Start a transaction that's been rolled back. This special function +exists for the case when InnoDB's deadlock detector has rolledack +a transaction. While the transaction has been rolled back the handle +is still valid and can be reused by calling this function. If you +don't want to reuse the transaction handle then you can free the handle +by calling ib_trx_release(). +@return innobase txn handle */ + +ib_err_t +ib_trx_start( +/*=========*/ + ib_trx_t ib_trx, /*!< in: transaction to restart */ + ib_trx_level_t ib_trx_level, /*!< in: trx isolation level */ + ib_bool_t read_write, /*!< in: true if read write + transaction */ + ib_bool_t auto_commit, /*!< in: auto commit after each + single DML */ + void* thd); /*!< in: THD */ + +/*****************************************************************//** +Begin a transaction. This will allocate a new transaction handle and +put the transaction in the active state. +@return innobase txn handle */ + +ib_trx_t +ib_trx_begin( +/*=========*/ + ib_trx_level_t ib_trx_level, /*!< in: trx isolation level */ + ib_bool_t read_write, /*!< in: true if read write + transaction */ + ib_bool_t auto_commit); /*!< in: auto commit after each + single DML */ + +/*****************************************************************//** +Query the transaction's state. This function can be used to check for +the state of the transaction in case it has been rolled back by the +InnoDB deadlock detector. Note that when a transaction is selected as +a victim for rollback, InnoDB will always return an appropriate error +code indicating this. @see DB_DEADLOCK, @see DB_LOCK_TABLE_FULL and +@see DB_LOCK_WAIT_TIMEOUT +@return transaction state */ + +ib_trx_state_t +ib_trx_state( +/*=========*/ + ib_trx_t ib_trx); /*!< in: trx handle */ + +/*****************************************************************//** +Release the resources of the transaction. If the transaction was +selected as a victim by InnoDB and rolled back then use this function +to free the transaction handle. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_trx_release( +/*===========*/ + ib_trx_t ib_trx); /*!< in: trx handle */ + +/*****************************************************************//** +Commit a transaction. This function will release the schema latches too. +It will also free the transaction handle. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_trx_commit( +/*==========*/ + ib_trx_t ib_trx); /*!< in: trx handle */ + +/*****************************************************************//** +Rollback a transaction. This function will release the schema latches too. +It will also free the transaction handle. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_trx_rollback( +/*============*/ + ib_trx_t ib_trx); /*!< in: trx handle */ + +/*****************************************************************//** +Open an InnoDB table and return a cursor handle to it. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_open_table_using_id( +/*==========================*/ + ib_id_u64_t table_id, /*!< in: table id of table to open */ + ib_trx_t ib_trx, /*!< in: Current transaction handle + can be NULL */ + ib_crsr_t* ib_crsr); /*!< out,own: InnoDB cursor */ + +/*****************************************************************//** +Open an InnoDB index and return a cursor handle to it. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_open_index_using_id( +/*==========================*/ + ib_id_u64_t index_id, /*!< in: index id of index to open */ + ib_trx_t ib_trx, /*!< in: Current transaction handle + can be NULL */ + ib_crsr_t* ib_crsr); /*!< out: InnoDB cursor */ + +/*****************************************************************//** +Open an InnoDB secondary index cursor and return a cursor handle to it. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_open_index_using_name( +/*============================*/ + ib_crsr_t ib_open_crsr, /*!< in: open/active cursor */ + const char* index_name, /*!< in: secondary index name */ + ib_crsr_t* ib_crsr, /*!< out,own: InnoDB index cursor */ + int* idx_type, /*!< out: index is cluster index */ + ib_id_u64_t* idx_id); /*!< out: index id */ + +/*****************************************************************//** +Open an InnoDB table by name and return a cursor handle to it. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_open_table( +/*=================*/ + const char* name, /*!< in: table name */ + ib_trx_t ib_trx, /*!< in: Current transaction handle + can be NULL */ + ib_crsr_t* ib_crsr); /*!< out,own: InnoDB cursor */ + +/*****************************************************************//** +Reset the cursor. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_reset( +/*============*/ + ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */ + + +/*****************************************************************//** +set a cursor trx to NULL*/ + +void +ib_cursor_clear_trx( +/*================*/ + ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */ + +/*****************************************************************//** +Close an InnoDB table and free the cursor. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_close( +/*============*/ + ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */ + +/*****************************************************************//** +Close the table, decrement n_ref_count count. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_close_table( +/*==================*/ + ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */ + +/*****************************************************************//** +update the cursor with new transactions and also reset the cursor +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_new_trx( +/*==============*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_trx_t ib_trx); /*!< in: transaction */ + +/*****************************************************************//** +Commit the transaction in a cursor +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_commit_trx( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_trx_t ib_trx); /*!< in: transaction */ + +/********************************************************************//** +Open a table using the table name, if found then increment table ref count. +@return table instance if found */ + +void* +ib_open_table_by_name( +/*==================*/ + const char* name); /*!< in: table name to lookup */ + +/*****************************************************************//** +Insert a row to a table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_insert_row( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor instance */ + const ib_tpl_t ib_tpl); /*!< in: tuple to insert */ + +/*****************************************************************//** +Update a row in a table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_update_row( +/*=================*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + const ib_tpl_t ib_old_tpl, /*!< in: Old tuple in table */ + const ib_tpl_t ib_new_tpl); /*!< in: New tuple to update */ + +/*****************************************************************//** +Delete a row in a table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_delete_row( +/*=================*/ + ib_crsr_t ib_crsr); /*!< in: cursor instance */ + +/*****************************************************************//** +Read current row. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_read_row( +/*===============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_tpl_t ib_tpl, /*!< out: read cols into this tuple */ + void** row_buf, /*!< in/out: row buffer */ + ib_ulint_t* row_len); /*!< in/out: row buffer len */ + +/*****************************************************************//** +Move cursor to the first record in the table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_first( +/*============*/ + ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */ + +/*****************************************************************//** +Move cursor to the last record in the table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_last( +/*===========*/ + ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */ + +/*****************************************************************//** +Move cursor to the next record in the table. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_next( +/*===========*/ + ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */ + +/*****************************************************************//** +Search for key. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_cursor_moveto( +/*=============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_tpl_t ib_tpl, /*!< in: Key to search for */ + ib_srch_mode_t ib_srch_mode); /*!< in: search mode */ + +/*****************************************************************//** +Set the match mode for ib_cursor_move(). */ + +void +ib_cursor_set_match_mode( +/*=====================*/ + ib_crsr_t ib_crsr, /*!< in: Cursor instance */ + ib_match_mode_t match_mode); /*!< in: ib_cursor_moveto match mode */ + +/*****************************************************************//** +Set a column of the tuple. Make a copy using the tuple's heap. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_col_set_value( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t col_no, /*!< in: column index in tuple */ + const void* src, /*!< in: data value */ + ib_ulint_t len, /*!< in: data value len */ + ib_bool_t need_cpy); /*!< in: if need memcpy */ + + +/*****************************************************************//** +Get the size of the data available in the column the tuple. +@return bytes avail or IB_SQL_NULL */ + +ib_ulint_t +ib_col_get_len( +/*===========*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i); /*!< in: column index in tuple */ + +/*****************************************************************//** +Copy a column value from the tuple. +@return bytes copied or IB_SQL_NULL */ + +ib_ulint_t +ib_col_copy_value( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: tuple instance */ + ib_ulint_t i, /*!< in: column index in tuple */ + void* dst, /*!< out: copied data value */ + ib_ulint_t len); /*!< in: max data value len to copy */ + +/*************************************************************//** +Read a signed int 8 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_i8( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i8_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read an unsigned int 8 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_u8( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u8_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read a signed int 16 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_i16( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i16_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read an unsigned int 16 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_u16( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u16_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read a signed int 32 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_i32( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i32_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read an unsigned int 32 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_u32( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u32_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read a signed int 64 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_i64( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_i64_t* ival); /*!< out: integer value */ + +/*************************************************************//** +Read an unsigned int 64 bit column from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_u64( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_u64_t* ival); /*!< out: integer value */ + +/*****************************************************************//** +Get a column value pointer from the tuple. +@return NULL or pointer to buffer */ + +const void* +ib_col_get_value( +/*=============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i); /*!< in: column number */ + +/*****************************************************************//** +Get a column type, length and attributes from the tuple. +@return len of column data */ + +ib_ulint_t +ib_col_get_meta( +/*============*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t i, /*!< in: column number */ + ib_col_meta_t* ib_col_meta); /*!< out: column meta data */ + +/*****************************************************************//** +"Clear" or reset an InnoDB tuple. We free the heap and recreate the tuple. +@return new tuple, or NULL */ + +ib_tpl_t +ib_tuple_clear( +/*============*/ + ib_tpl_t ib_tpl); /*!< in: InnoDB tuple */ + +/*****************************************************************//** +Create a new cluster key search tuple and copy the contents of the +secondary index key tuple columns that refer to the cluster index record +to the cluster key. It does a deep copy of the column data. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_tuple_get_cluster_key( +/*=====================*/ + ib_crsr_t ib_crsr, /*!< in: secondary index cursor */ + ib_tpl_t* ib_dst_tpl, /*!< out,own: destination tuple */ + const ib_tpl_t ib_src_tpl); /*!< in: source tuple */ + +/*****************************************************************//** +Copy the contents of source tuple to destination tuple. The tuples +must be of the same type and belong to the same table/index. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_tuple_copy( +/*==========*/ + ib_tpl_t ib_dst_tpl, /*!< in: destination tuple */ + const ib_tpl_t ib_src_tpl); /*!< in: source tuple */ + +/*****************************************************************//** +Create an InnoDB tuple used for index/table search. +@return tuple for current index */ + +ib_tpl_t +ib_sec_search_tuple_create( +/*=======================*/ + ib_crsr_t ib_crsr); /*!< in: Cursor instance */ + +/*****************************************************************//** +Create an InnoDB tuple used for index/table search. +@return tuple for current index */ + +ib_tpl_t +ib_sec_read_tuple_create( +/*=====================*/ + ib_crsr_t ib_crsr); /*!< in: Cursor instance */ + +/*****************************************************************//** +Create an InnoDB tuple used for table key operations. +@return tuple for current table */ + +ib_tpl_t +ib_clust_search_tuple_create( +/*=========================*/ + ib_crsr_t ib_crsr); /*!< in: Cursor instance */ + +/*****************************************************************//** +Create an InnoDB tuple for table row operations. +@return tuple for current table */ + +ib_tpl_t +ib_clust_read_tuple_create( +/*=======================*/ + ib_crsr_t ib_crsr); /*!< in: Cursor instance */ + +/*****************************************************************//** +Return the number of user columns in the tuple definition. +@return number of user columns */ + +ib_ulint_t +ib_tuple_get_n_user_cols( +/*=====================*/ + const ib_tpl_t ib_tpl); /*!< in: Tuple for current table */ + +/*****************************************************************//** +Return the number of columns in the tuple definition. +@return number of columns */ + +ib_ulint_t +ib_tuple_get_n_cols( +/*================*/ + const ib_tpl_t ib_tpl); /*!< in: Tuple for current table */ + +/*****************************************************************//** +Destroy an InnoDB tuple. */ + +void +ib_tuple_delete( +/*============*/ + ib_tpl_t ib_tpl); /*!< in,own: Tuple instance to delete */ + +/*****************************************************************//** +Truncate a table. The cursor handle will be closed and set to NULL +on success. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_cursor_truncate( +/*===============*/ + ib_crsr_t* ib_crsr, /*!< in/out: cursor for table + to truncate */ + ib_id_u64_t* table_id); /*!< out: new table id */ + +/*****************************************************************//** +Get a table id. +@return DB_SUCCESS if found */ + +ib_err_t +ib_table_get_id( +/*============*/ + const char* table_name, /*!< in: table to find */ + ib_id_u64_t* table_id); /*!< out: table id if found */ + +/*****************************************************************//** +Get an index id. +@return DB_SUCCESS if found */ + +ib_err_t +ib_index_get_id( +/*============*/ + const char* table_name, /*!< in: find index for this table */ + const char* index_name, /*!< in: index to find */ + ib_id_u64_t* index_id); /*!< out: index id if found */ + +/*****************************************************************//** +Check if cursor is positioned. +@return IB_TRUE if positioned */ + +ib_bool_t +ib_cursor_is_positioned( +/*====================*/ + const ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */ + +/*****************************************************************//** +Checks if the data dictionary is latched in exclusive mode by a +user transaction. +@return TRUE if exclusive latch */ + +ib_bool_t +ib_schema_lock_is_exclusive( +/*========================*/ + const ib_trx_t ib_trx); /*!< in: transaction */ + +/*****************************************************************//** +Lock an InnoDB cursor/table. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_cursor_lock( +/*===========*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_lck_mode_t ib_lck_mode); /*!< in: InnoDB lock mode */ + +/*****************************************************************//** +Set the Lock an InnoDB table using the table id. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_table_lock( +/*===========*/ + ib_trx_t ib_trx, /*!< in/out: transaction */ + ib_id_u64_t table_id, /*!< in: table id */ + ib_lck_mode_t ib_lck_mode); /*!< in: InnoDB lock mode */ + +/*****************************************************************//** +Set the Lock mode of the cursor. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_cursor_set_lock_mode( +/*====================*/ + ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */ + ib_lck_mode_t ib_lck_mode); /*!< in: InnoDB lock mode */ + +/*****************************************************************//** +Set need to access clustered index record flag. */ + +void +ib_cursor_set_cluster_access( +/*=========================*/ + ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_i8( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i8_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_i16( +/*=================*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i16_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_i32( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i32_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_i64( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_i64_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_u8( +/*==============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u8_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_u16( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u16_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_u32( +/*=================*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u32_t val); /*!< in: value to write */ + +/*****************************************************************//** +Write an integer value to a column. Integers are stored in big-endian +format and will need to be converted from the host format. +@return DB_SUCESS or error */ + +ib_err_t +ib_tuple_write_u64( +/*===============*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + ib_u64_t val); /*!< in: value to write */ + +/*****************************************************************//** +Inform the cursor that it's the start of an SQL statement. */ + +void +ib_cursor_stmt_begin( +/*=================*/ + ib_crsr_t ib_crsr); /*!< in: cursor */ + +/*****************************************************************//** +Write a double value to a column. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_write_double( +/*==================*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + int col_no, /*!< in: column number */ + double val); /*!< in: value to write */ + +/*************************************************************//** +Read a double column value from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_double( +/*=================*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t col_no, /*!< in: column number */ + double* dval); /*!< out: double value */ + +/*****************************************************************//** +Write a float value to a column. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_write_float( +/*=================*/ + ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */ + int col_no, /*!< in: column number */ + float val); /*!< in: value to write */ + +/*************************************************************//** +Read a float value from an InnoDB tuple. +@return DB_SUCCESS or error */ + +ib_err_t +ib_tuple_read_float( +/*================*/ + ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */ + ib_ulint_t col_no, /*!< in: column number */ + float* fval); /*!< out: float value */ + +/*****************************************************************//** +Get a column type, length and attributes from the tuple. +@return len of column data */ + +const char* +ib_col_get_name( +/*============*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_ulint_t i); /*!< in: column index in tuple */ + +/*****************************************************************//** +Get an index field name from the cursor. +@return name of the field */ + +const char* +ib_get_idx_field_name( +/*==================*/ + ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */ + ib_ulint_t i); /*!< in: column index in tuple */ + +/*****************************************************************//** +Truncate a table. +@return DB_SUCCESS or error code */ + +ib_err_t +ib_table_truncate( +/*==============*/ + const char* table_name, /*!< in: table name */ + ib_id_u64_t* table_id); /*!< out: new table id */ + +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return DB_SUCCESS or error number */ + +ib_err_t +ib_close_thd( +/*=========*/ + void* thd); /*!< in: handle to the MySQL + thread of the user whose resources + should be free'd */ + +/*****************************************************************//** +Get generic configure status +@return configure status*/ + +int +ib_cfg_get_cfg(); +/*============*/ + +/*****************************************************************//** +Increase/decrease the memcached sync count of table to sync memcached +DML with SQL DDLs. +@return DB_SUCCESS or error number */ +ib_err_t +ib_cursor_set_memcached_sync( +/*=========================*/ + ib_crsr_t ib_crsr, /*!< in: cursor */ + ib_bool_t flag); /*!< in: true for increasing */ + +/*****************************************************************//** +Check whether the table name conforms to our requirements. Currently +we only do a simple check for the presence of a '/'. +@return DB_SUCCESS or err code */ + +ib_err_t +ib_table_name_check( +/*================*/ + const char* name); /*!< in: table name to check */ + +/*****************************************************************//** +Return isolation configuration set by "innodb_api_trx_level" +@return trx isolation level*/ + +ib_trx_state_t +ib_cfg_trx_level(); +/*==============*/ + +/*****************************************************************//** +Return configure value for background commit interval (in seconds) +@return background commit interval (in seconds) */ + +ib_ulint_t +ib_cfg_bk_commit_interval(); +/*=======================*/ + +/*****************************************************************//** +Get a trx start time. +@return trx start_time */ + +ib_u64_t +ib_trx_get_start_time( +/*==================*/ + ib_trx_t ib_trx); /*!< in: transaction */ + +#endif /* api0api_h */ diff --git a/storage/innobase/include/api0misc.h b/storage/innobase/include/api0misc.h new file mode 100644 index 00000000000..fcd748390d1 --- /dev/null +++ b/storage/innobase/include/api0misc.h @@ -0,0 +1,78 @@ +/***************************************************************************** + +Copyright (c) 2008, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/api0misc.h +InnoDB Native API + +3/20/2011 Jimmy Yang extracted from Embedded InnoDB +2008 Created by Sunny Bains +*******************************************************/ + +#ifndef api0misc_h +#define api0misc_h + +#include "univ.i" +#include "os0file.h" +#include "que0que.h" +#include "trx0trx.h" + +/** Whether binlog is enabled for applications using InnoDB APIs */ +extern my_bool ib_binlog_enabled; + +/** Whether MySQL MDL is enabled for applications using InnoDB APIs */ +extern my_bool ib_mdl_enabled; + +/** Whether InnoDB row lock is disabled for applications using InnoDB APIs */ +extern my_bool ib_disable_row_lock; + +/** configure value for transaction isolation level */ +extern ulong ib_trx_level_setting; + +/** configure value for background commit interval (in seconds) */ +extern ulong ib_bk_commit_interval; + +/******************************************************************** +Handles user errors and lock waits detected by the database engine. +@return TRUE if it was a lock wait and we should continue running +the query thread */ +UNIV_INTERN +ibool +ib_handle_errors( +/*=============*/ + dberr_t* new_err, /*!< out: possible new error + encountered in lock wait, or if + no new error, the value of + trx->error_state at the entry of this + function */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread */ + trx_savept_t* savept); /*!< in: savepoint or NULL */ + +/************************************************************************* +Sets a lock on a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +ib_trx_lock_table_with_retry( +/*=========================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode); /*!< in: lock mode */ + +#endif /* api0misc_h */ diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h new file mode 100644 index 00000000000..305acf7e322 --- /dev/null +++ b/storage/innobase/include/btr0btr.h @@ -0,0 +1,773 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0btr.h +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef btr0btr_h +#define btr0btr_h + +#include "univ.i" + +#include "dict0dict.h" +#include "data0data.h" +#include "page0cur.h" +#include "mtr0mtr.h" +#include "btr0types.h" + +#ifndef UNIV_HOTBACKUP +/** Maximum record size which can be stored on a page, without using the +special big record storage structure */ +#define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200) + +/** @brief Maximum depth of a B-tree in InnoDB. + +Note that this isn't a maximum as such; none of the tree operations +avoid producing trees bigger than this. It is instead a "max depth +that other code must work with", useful for e.g. fixed-size arrays +that must store some information about each level in a tree. In other +words: if a B-tree with bigger depth than this is encountered, it is +not acceptable for it to lead to mysterious memory corruption, but it +is acceptable for the program to die with a clear assert failure. */ +#define BTR_MAX_LEVELS 100 + +/** Latching modes for btr_cur_search_to_nth_level(). */ +enum btr_latch_mode { + /** Search a record on a leaf page and S-latch it. */ + BTR_SEARCH_LEAF = RW_S_LATCH, + /** (Prepare to) modify a record on a leaf page and X-latch it. */ + BTR_MODIFY_LEAF = RW_X_LATCH, + /** Obtain no latches. */ + BTR_NO_LATCHES = RW_NO_LATCH, + /** Start modifying the entire B-tree. */ + BTR_MODIFY_TREE = 33, + /** Continue modifying the entire B-tree. */ + BTR_CONT_MODIFY_TREE = 34, + /** Search the previous record. */ + BTR_SEARCH_PREV = 35, + /** Modify the previous record. */ + BTR_MODIFY_PREV = 36 +}; + +/* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually exclusive. */ + +/** If this is ORed to btr_latch_mode, it means that the search tuple +will be inserted to the index, at the searched position. +When the record is not in the buffer pool, try to use the insert buffer. */ +#define BTR_INSERT 512 + +/** This flag ORed to btr_latch_mode says that we do the search in query +optimization */ +#define BTR_ESTIMATE 1024 + +/** This flag ORed to BTR_INSERT says that we can ignore possible +UNIQUE definition on secondary indexes when we decide if we can use +the insert buffer to speed up inserts */ +#define BTR_IGNORE_SEC_UNIQUE 2048 + +/** Try to delete mark the record at the searched position using the +insert/delete buffer when the record is not in the buffer pool. */ +#define BTR_DELETE_MARK 4096 + +/** Try to purge the record at the searched position using the insert/delete +buffer when the record is not in the buffer pool. */ +#define BTR_DELETE 8192 + +/** In the case of BTR_SEARCH_LEAF or BTR_MODIFY_LEAF, the caller is +already holding an S latch on the index tree */ +#define BTR_ALREADY_S_LATCHED 16384 + +#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \ + ((latch_mode) & ~(BTR_INSERT \ + | BTR_DELETE_MARK \ + | BTR_DELETE \ + | BTR_ESTIMATE \ + | BTR_IGNORE_SEC_UNIQUE \ + | BTR_ALREADY_S_LATCHED)) +#endif /* UNIV_HOTBACKUP */ + +/**************************************************************//** +Report that an index page is corrupted. */ +UNIV_INTERN +void +btr_corruption_report( +/*==================*/ + const buf_block_t* block, /*!< in: corrupted block */ + const dict_index_t* index) /*!< in: index tree */ + UNIV_COLD __attribute__((nonnull)); + +/** Assert that a B-tree page is not corrupted. +@param block buffer block containing a B-tree page +@param index the B-tree index */ +#define btr_assert_not_corrupted(block, index) \ + if ((ibool) !!page_is_comp(buf_block_get_frame(block)) \ + != dict_table_is_comp((index)->table)) { \ + btr_corruption_report(block, index); \ + ut_error; \ + } + +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_BLOB_DEBUG +# include "ut0rbt.h" +/** An index->blobs entry for keeping track of off-page column references */ +struct btr_blob_dbg_t +{ + unsigned blob_page_no:32; /*!< first BLOB page number */ + unsigned ref_page_no:32; /*!< referring page number */ + unsigned ref_heap_no:16; /*!< referring heap number */ + unsigned ref_field_no:10; /*!< referring field number */ + unsigned owner:1; /*!< TRUE if BLOB owner */ + unsigned always_owner:1; /*!< TRUE if always + has been the BLOB owner; + reset to TRUE on B-tree + page splits and merges */ + unsigned del:1; /*!< TRUE if currently + delete-marked */ +}; + +/**************************************************************//** +Add a reference to an off-page column to the index->blobs map. */ +UNIV_INTERN +void +btr_blob_dbg_add_blob( +/*==================*/ + const rec_t* rec, /*!< in: clustered index record */ + ulint field_no, /*!< in: number of off-page column */ + ulint page_no, /*!< in: start page of the column */ + dict_index_t* index, /*!< in/out: index tree */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Display the references to off-page columns. +This function is to be called from a debugger, +for example when a breakpoint on ut_dbg_assertion_failed is hit. */ +UNIV_INTERN +void +btr_blob_dbg_print( +/*===============*/ + const dict_index_t* index) /*!< in: index tree */ + __attribute__((nonnull)); +/**************************************************************//** +Check that there are no references to off-page columns from or to +the given page. Invoked when freeing or clearing a page. +@return TRUE when no orphan references exist */ +UNIV_INTERN +ibool +btr_blob_dbg_is_empty( +/*==================*/ + dict_index_t* index, /*!< in: index */ + ulint page_no) /*!< in: page number */ + __attribute__((nonnull, warn_unused_result)); + +/**************************************************************//** +Modify the 'deleted' flag of a record. */ +UNIV_INTERN +void +btr_blob_dbg_set_deleted_flag( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ibool del) /*!< in: TRUE=deleted, FALSE=exists */ + __attribute__((nonnull)); +/**************************************************************//** +Change the ownership of an off-page column. */ +UNIV_INTERN +void +btr_blob_dbg_owner( +/*===============*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: rec_get_offs(rec, index) */ + ulint i, /*!< in: ith field in rec */ + ibool own) /*!< in: TRUE=owned, FALSE=disowned */ + __attribute__((nonnull)); +/** Assert that there are no BLOB references to or from the given page. */ +# define btr_blob_dbg_assert_empty(index, page_no) \ + ut_a(btr_blob_dbg_is_empty(index, page_no)) +#else /* UNIV_BLOB_DEBUG */ +# define btr_blob_dbg_add_blob(rec, field_no, page, index, ctx) ((void) 0) +# define btr_blob_dbg_set_deleted_flag(rec, index, offsets, del)((void) 0) +# define btr_blob_dbg_owner(rec, index, offsets, i, val) ((void) 0) +# define btr_blob_dbg_assert_empty(index, page_no) ((void) 0) +#endif /* UNIV_BLOB_DEBUG */ + +/**************************************************************//** +Gets the root node of a tree and x-latches it. +@return root page, x-latched */ +UNIV_INTERN +page_t* +btr_root_get( +/*=========*/ + const dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +UNIV_INTERN +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ + __attribute__((nonnull, warn_unused_result)); + +/**************************************************************//** +Gets the height of the B-tree (the level of the root, when the leaf +level is assumed to be 0). The caller must hold an S or X latch on +the index. +@return tree height (level of the root) */ +UNIV_INTERN +ulint +btr_height_get( +/*===========*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +/**************************************************************//** +Gets a buffer page and declares its latching order level. */ +UNIV_INLINE +buf_block_t* +btr_block_get_func( +/*===============*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + ulint mode, /*!< in: latch mode */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ +# ifdef UNIV_SYNC_DEBUG + const dict_index_t* index, /*!< in: index tree, may be NULL + if it is not an insert buffer tree */ +# endif /* UNIV_SYNC_DEBUG */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +# ifdef UNIV_SYNC_DEBUG +/** Gets a buffer page and declares its latching order level. +@param space tablespace identifier +@param zip_size compressed page size in bytes or 0 for uncompressed pages +@param page_no page number +@param mode latch mode +@param index index tree, may be NULL if not the insert buffer tree +@param mtr mini-transaction handle +@return the block descriptor */ +# define btr_block_get(space,zip_size,page_no,mode,index,mtr) \ + btr_block_get_func(space,zip_size,page_no,mode, \ + __FILE__,__LINE__,index,mtr) +# else /* UNIV_SYNC_DEBUG */ +/** Gets a buffer page and declares its latching order level. +@param space tablespace identifier +@param zip_size compressed page size in bytes or 0 for uncompressed pages +@param page_no page number +@param mode latch mode +@param idx index tree, may be NULL if not the insert buffer tree +@param mtr mini-transaction handle +@return the block descriptor */ +# define btr_block_get(space,zip_size,page_no,mode,idx,mtr) \ + btr_block_get_func(space,zip_size,page_no,mode,__FILE__,__LINE__,mtr) +# endif /* UNIV_SYNC_DEBUG */ +/** Gets a buffer page and declares its latching order level. +@param space tablespace identifier +@param zip_size compressed page size in bytes or 0 for uncompressed pages +@param page_no page number +@param mode latch mode +@param idx index tree, may be NULL if not the insert buffer tree +@param mtr mini-transaction handle +@return the uncompressed page frame */ +# define btr_page_get(space,zip_size,page_no,mode,idx,mtr) \ + buf_block_get_frame(btr_block_get(space,zip_size,page_no,mode,idx,mtr)) +#endif /* !UNIV_HOTBACKUP */ +/**************************************************************//** +Gets the index id field of a page. +@return index id */ +UNIV_INLINE +index_id_t +btr_page_get_index_id( +/*==================*/ + const page_t* page) /*!< in: index page */ + __attribute__((nonnull, pure, warn_unused_result)); +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Gets the node level field in an index page. +@return level, leaf level == 0 */ +UNIV_INLINE +ulint +btr_page_get_level_low( +/*===================*/ + const page_t* page) /*!< in: index page */ + __attribute__((nonnull, pure, warn_unused_result)); +#define btr_page_get_level(page, mtr) btr_page_get_level_low(page) +/********************************************************//** +Gets the next index page number. +@return next page number */ +UNIV_INLINE +ulint +btr_page_get_next( +/*==============*/ + const page_t* page, /*!< in: index page */ + mtr_t* mtr) /*!< in: mini-transaction handle */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************//** +Gets the previous index page number. +@return prev page number */ +UNIV_INLINE +ulint +btr_page_get_prev( +/*==============*/ + const page_t* page, /*!< in: index page */ + mtr_t* mtr) /*!< in: mini-transaction handle */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Gets pointer to the previous user record in the tree. It is assumed +that the caller has appropriate latches on the page and its neighbor. +@return previous user record, NULL if there is none */ +UNIV_INTERN +rec_t* +btr_get_prev_user_rec( +/*==================*/ + rec_t* rec, /*!< in: record on leaf level */ + mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if + needed, also to the previous page */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Gets pointer to the next user record in the tree. It is assumed +that the caller has appropriate latches on the page and its neighbor. +@return next user record, NULL if there is none */ +UNIV_INTERN +rec_t* +btr_get_next_user_rec( +/*==================*/ + rec_t* rec, /*!< in: record on leaf level */ + mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if + needed, also to the next page */ + __attribute__((nonnull, warn_unused_result)); +/**************************************************************//** +Releases the latch on a leaf page and bufferunfixes it. */ +UNIV_INLINE +void +btr_leaf_page_release( +/*==================*/ + buf_block_t* block, /*!< in: buffer block */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/**************************************************************//** +Gets the child node file address in a node pointer. +NOTE: the offsets array must contain all offsets for the record since +we read the last field according to offsets and assume that it contains +the child page number. In other words offsets must have been retrieved +with rec_get_offsets(n_fields=ULINT_UNDEFINED). +@return child node address */ +UNIV_INLINE +ulint +btr_node_ptr_get_child_page_no( +/*===========================*/ + const rec_t* rec, /*!< in: node pointer record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/************************************************************//** +Creates the root node for a new index tree. +@return page number of the created root, FIL_NULL if did not succeed */ +UNIV_INTERN +ulint +btr_create( +/*=======*/ + ulint type, /*!< in: type of the index */ + ulint space, /*!< in: space where created */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + index_id_t index_id,/*!< in: index id */ + dict_index_t* index, /*!< in: index */ + mtr_t* mtr) /*!< in: mini-transaction handle */ + __attribute__((nonnull)); +/************************************************************//** +Frees a B-tree except the root page, which MUST be freed after this +by calling btr_free_root. */ +UNIV_INTERN +void +btr_free_but_not_root( +/*==================*/ + ulint space, /*!< in: space where created */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint root_page_no); /*!< in: root page number */ +/************************************************************//** +Frees the B-tree root page. Other tree MUST already have been freed. */ +UNIV_INTERN +void +btr_free_root( +/*==========*/ + ulint space, /*!< in: space where created */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint root_page_no, /*!< in: root page number */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/*************************************************************//** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. +@return inserted record */ +UNIV_INTERN +rec_t* +btr_root_raise_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + ulint** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +UNIV_INTERN +bool +btr_page_reorganize_low( +/*====================*/ + bool recovery,/*!< in: true if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +UNIV_INTERN +bool +btr_page_reorganize( +/*================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/*************************************************************//** +Decides if the page should be split at the convergence point of +inserts converging to left. +@return TRUE if split recommended */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_left( +/*===========================*/ + btr_cur_t* cursor, /*!< in: cursor at which to insert */ + rec_t** split_rec)/*!< out: if split recommended, + the first record on upper half page, + or NULL if tuple should be first */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Decides if the page should be split at the convergence point of +inserts converging to right. +@return TRUE if split recommended */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_right( +/*============================*/ + btr_cur_t* cursor, /*!< in: cursor at which to insert */ + rec_t** split_rec)/*!< out: if split recommended, + the first record on upper half page, + or NULL if tuple should be first */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. + +@return inserted record */ +UNIV_INTERN +rec_t* +btr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + ulint** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************//** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. */ +UNIV_INTERN +void +btr_insert_on_non_leaf_level_func( +/*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level, must be > 0 */ + dtuple_t* tuple, /*!< in: the record to be inserted */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +# define btr_insert_on_non_leaf_level(f,i,l,t,m) \ + btr_insert_on_non_leaf_level_func(f,i,l,t,__FILE__,__LINE__,m) +#endif /* !UNIV_HOTBACKUP */ +/****************************************************************//** +Sets a record as the predefined minimum record. */ +UNIV_INTERN +void +btr_set_min_rec_mark( +/*=================*/ + rec_t* rec, /*!< in/out: record */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Deletes on the upper level the node pointer to a page. */ +UNIV_INTERN +void +btr_node_ptr_delete( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page whose node pointer is deleted */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +/************************************************************//** +Checks that the node pointer to a page is appropriate. +@return TRUE */ +UNIV_INTERN +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ +/*************************************************************//** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the +brother reside on the same page. If the left brother does not satisfy these +conditions, looks at the right brother. If the page is the only one on that +level lifts the records of the page to the father page, thus reducing the +tree height. It is assumed that mtr holds an x-latch on the tree and on the +page. If cursor is on the leaf level, mtr must also hold x-latches to +the brothers, if they exist. +@return TRUE on success */ +UNIV_INTERN +ibool +btr_compress( +/*=========*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +UNIV_INTERN +void +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/****************************************************************//** +Parses the redo log record for setting an index record as the predefined +minimum record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_parse_set_min_rec_mark( +/*=======================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + ulint comp, /*!< in: nonzero=compact page format */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ + __attribute__((nonnull(1,2), warn_unused_result)); +/***********************************************************//** +Parses a redo log record of reorganizing a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_parse_page_reorganize( +/*======================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + dict_index_t* index, /*!< in: record descriptor */ + bool compressed,/*!< in: true if compressed page */ + buf_block_t* block, /*!< in: page to be reorganized, or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ + __attribute__((nonnull(1,2,3), warn_unused_result)); +#ifndef UNIV_HOTBACKUP +/**************************************************************//** +Gets the number of pages in a B-tree. +@return number of pages, or ULINT_UNDEFINED if the index is unavailable */ +UNIV_INTERN +ulint +btr_get_size( +/*=========*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ + __attribute__((nonnull, warn_unused_result)); +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated +@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded +(init_mtr == mtr, or the page was not previously freed in mtr) +@retval block (not allocated or initialized) otherwise */ +UNIV_INTERN +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index tree */ + ulint hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr) /*!< in/out: mini-transaction + for x-latching and initializing + the page */ + __attribute__((nonnull, warn_unused_result)); +/**************************************************************//** +Frees a file page used in an index tree. NOTE: cannot free field external +storage pages because the page must contain info on its level. */ +UNIV_INTERN +void +btr_page_free( +/*==========*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/**************************************************************//** +Frees a file page used in an index tree. Can be used also to BLOB +external storage pages, because the page level 0 can be given as an +argument. */ +UNIV_INTERN +void +btr_page_free_low( +/*==============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + ulint level, /*!< in: page level */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +#ifdef UNIV_BTR_PRINT +/*************************************************************//** +Prints size info of a B-tree. */ +UNIV_INTERN +void +btr_print_size( +/*===========*/ + dict_index_t* index) /*!< in: index tree */ + __attribute__((nonnull)); +/**************************************************************//** +Prints directories and other info of all nodes in the index. */ +UNIV_INTERN +void +btr_print_index( +/*============*/ + dict_index_t* index, /*!< in: index */ + ulint width) /*!< in: print this many entries from start + and end */ + __attribute__((nonnull)); +#endif /* UNIV_BTR_PRINT */ +/************************************************************//** +Checks the size and number of fields in a record based on the definition of +the index. +@return TRUE if ok */ +UNIV_INTERN +ibool +btr_index_rec_validate( +/*===================*/ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index, /*!< in: index */ + ibool dump_on_error) /*!< in: TRUE if the function + should print hex dump of record + and page on error */ + __attribute__((nonnull, warn_unused_result)); +/**************************************************************//** +Checks the consistency of an index tree. +@return TRUE if ok */ +UNIV_INTERN +bool +btr_validate_index( +/*===============*/ + dict_index_t* index, /*!< in: index */ + const trx_t* trx) /*!< in: transaction or 0 */ + __attribute__((nonnull(1), warn_unused_result)); + +#define BTR_N_LEAF_PAGES 1 +#define BTR_TOTAL_SIZE 2 +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_NONINL +#include "btr0btr.ic" +#endif + +#endif diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic new file mode 100644 index 00000000000..00f50b5dcaf --- /dev/null +++ b/storage/innobase/include/btr0btr.ic @@ -0,0 +1,290 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0btr.ic +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#ifndef UNIV_HOTBACKUP +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "page0zip.h" + +#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level + (not really a hard limit). + Used in debug assertions + in btr_page_set_level and + btr_page_get_level_low */ + +/**************************************************************//** +Gets a buffer page and declares its latching order level. */ +UNIV_INLINE +buf_block_t* +btr_block_get_func( +/*===============*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + ulint mode, /*!< in: latch mode */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ +#ifdef UNIV_SYNC_DEBUG + const dict_index_t* index, /*!< in: index tree, may be NULL + if it is not an insert buffer tree */ +#endif /* UNIV_SYNC_DEBUG */ + mtr_t* mtr) /*!< in/out: mtr */ +{ + buf_block_t* block; + + block = buf_page_get_gen(space, zip_size, page_no, mode, + NULL, BUF_GET, file, line, mtr); + + if (mode != RW_NO_LATCH) { + + buf_block_dbg_add_level( + block, index != NULL && dict_index_is_ibuf(index) + ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE); + } + + return(block); +} + +/**************************************************************//** +Sets the index id field of a page. */ +UNIV_INLINE +void +btr_page_set_index_id( +/*==================*/ + page_t* page, /*!< in: page to be created */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + index_id_t id, /*!< in: index id */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (page_zip) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), id); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_INDEX_ID), + 8, mtr); + } else { + mlog_write_ull(page + (PAGE_HEADER + PAGE_INDEX_ID), id, mtr); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/**************************************************************//** +Gets the index id field of a page. +@return index id */ +UNIV_INLINE +index_id_t +btr_page_get_index_id( +/*==================*/ + const page_t* page) /*!< in: index page */ +{ + return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Gets the node level field in an index page. +@return level, leaf level == 0 */ +UNIV_INLINE +ulint +btr_page_get_level_low( +/*===================*/ + const page_t* page) /*!< in: index page */ +{ + ulint level; + + ut_ad(page); + + level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL); + + ut_ad(level <= BTR_MAX_NODE_LEVEL); + + return(level); +} + +/********************************************************//** +Sets the node level field in an index page. */ +UNIV_INLINE +void +btr_page_set_level( +/*===============*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint level, /*!< in: level, leaf level == 0 */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ut_ad(page && mtr); + ut_ad(level <= BTR_MAX_NODE_LEVEL); + + if (page_zip) { + mach_write_to_2(page + (PAGE_HEADER + PAGE_LEVEL), level); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_LEVEL), + 2, mtr); + } else { + mlog_write_ulint(page + (PAGE_HEADER + PAGE_LEVEL), level, + MLOG_2BYTES, mtr); + } +} + +/********************************************************//** +Gets the next index page number. +@return next page number */ +UNIV_INLINE +ulint +btr_page_get_next( +/*==============*/ + const page_t* page, /*!< in: index page */ + mtr_t* mtr __attribute__((unused))) + /*!< in: mini-transaction handle */ +{ + ut_ad(page && mtr); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX)); + + return(mach_read_from_4(page + FIL_PAGE_NEXT)); +} + +/********************************************************//** +Sets the next index page field. */ +UNIV_INLINE +void +btr_page_set_next( +/*==============*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint next, /*!< in: next page number */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ut_ad(page && mtr); + + if (page_zip) { + mach_write_to_4(page + FIL_PAGE_NEXT, next); + page_zip_write_header(page_zip, page + FIL_PAGE_NEXT, 4, mtr); + } else { + mlog_write_ulint(page + FIL_PAGE_NEXT, next, MLOG_4BYTES, mtr); + } +} + +/********************************************************//** +Gets the previous index page number. +@return prev page number */ +UNIV_INLINE +ulint +btr_page_get_prev( +/*==============*/ + const page_t* page, /*!< in: index page */ + mtr_t* mtr __attribute__((unused))) /*!< in: mini-transaction handle */ +{ + ut_ad(page && mtr); + + return(mach_read_from_4(page + FIL_PAGE_PREV)); +} + +/********************************************************//** +Sets the previous index page field. */ +UNIV_INLINE +void +btr_page_set_prev( +/*==============*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint prev, /*!< in: previous page number */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ut_ad(page && mtr); + + if (page_zip) { + mach_write_to_4(page + FIL_PAGE_PREV, prev); + page_zip_write_header(page_zip, page + FIL_PAGE_PREV, 4, mtr); + } else { + mlog_write_ulint(page + FIL_PAGE_PREV, prev, MLOG_4BYTES, mtr); + } +} + +/**************************************************************//** +Gets the child node file address in a node pointer. +NOTE: the offsets array must contain all offsets for the record since +we read the last field according to offsets and assume that it contains +the child page number. In other words offsets must have been retrieved +with rec_get_offsets(n_fields=ULINT_UNDEFINED). +@return child node address */ +UNIV_INLINE +ulint +btr_node_ptr_get_child_page_no( +/*===========================*/ + const rec_t* rec, /*!< in: node pointer record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + const byte* field; + ulint len; + ulint page_no; + + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + /* The child address is in the last field */ + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); + + ut_ad(len == 4); + + page_no = mach_read_from_4(field); + + if (page_no == 0) { + fprintf(stderr, + "InnoDB: a nonsensical page number 0" + " in a node ptr record at offset %lu\n", + (ulong) page_offset(rec)); + buf_page_print(page_align(rec), 0, 0); + ut_ad(0); + } + + return(page_no); +} + +/**************************************************************//** +Releases the latches on a leaf page and bufferunfixes it. */ +UNIV_INLINE +void +btr_leaf_page_release( +/*==================*/ + buf_block_t* block, /*!< in: buffer block */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF); + ut_ad(!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY)); + + mtr_memo_release(mtr, block, + latch_mode == BTR_SEARCH_LEAF + ? MTR_MEMO_PAGE_S_FIX + : MTR_MEMO_PAGE_X_FIX); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h new file mode 100644 index 00000000000..f1e4406fcf7 --- /dev/null +++ b/storage/innobase/include/btr0cur.h @@ -0,0 +1,937 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0cur.h +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#ifndef btr0cur_h +#define btr0cur_h + +#include "univ.i" +#include "dict0dict.h" +#include "page0cur.h" +#include "btr0types.h" + +/** Mode flags for btr_cur operations; these can be ORed */ +enum { + /** do no undo logging */ + BTR_NO_UNDO_LOG_FLAG = 1, + /** do no record lock checking */ + BTR_NO_LOCKING_FLAG = 2, + /** sys fields will be found in the update vector or inserted + entry */ + BTR_KEEP_SYS_FLAG = 4, + /** btr_cur_pessimistic_update() must keep cursor position + when moving columns to big_rec */ + BTR_KEEP_POS_FLAG = 8, + /** the caller is creating the index or wants to bypass the + index->info.online creation log */ + BTR_CREATE_FLAG = 16, + /** the caller of btr_cur_optimistic_update() or + btr_cur_update_in_place() will take care of + updating IBUF_BITMAP_FREE */ + BTR_KEEP_IBUF_BITMAP = 32 +}; + +#ifndef UNIV_HOTBACKUP +#include "que0types.h" +#include "row0types.h" +#include "ha0ha.h" + +#define BTR_CUR_ADAPT +#define BTR_CUR_HASH_ADAPT + +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the page cursor component of a tree cursor. +@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_cur_get_page_cur( +/*=================*/ + const btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the buffer block on which the tree cursor is positioned. +@return pointer to buffer block */ +UNIV_INLINE +buf_block_t* +btr_cur_get_block( +/*==============*/ + const btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the record pointer of a tree cursor. +@return pointer to record */ +UNIV_INLINE +rec_t* +btr_cur_get_rec( +/*============*/ + const btr_cur_t* cursor);/*!< in: tree cursor */ +#else /* UNIV_DEBUG */ +# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur) +# define btr_cur_get_block(cursor) ((cursor)->page_cur.block) +# define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec) +#endif /* UNIV_DEBUG */ +/*********************************************************//** +Returns the compressed page on which the tree cursor is positioned. +@return pointer to compressed page, or NULL if the page is not compressed */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Invalidates a tree cursor by setting record pointer to NULL. */ +UNIV_INLINE +void +btr_cur_invalidate( +/*===============*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the page of a tree cursor. +@return pointer to page */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the index of a cursor. +@param cursor b-tree cursor +@return index */ +#define btr_cur_get_index(cursor) ((cursor)->index) +/*********************************************************//** +Positions a tree cursor at a given record. */ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record in tree */ + buf_block_t* block, /*!< in: buffer block of rec */ + btr_cur_t* cursor);/*!< in: cursor */ +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +Note that if mode is PAGE_CUR_LE, which is used in inserts, then +cursor->up_match and cursor->low_match both will have sensible values. +If mode is PAGE_CUR_GE, then up_match will a have a sensible value. */ +UNIV_INTERN +void +btr_cur_search_to_nth_level( +/*========================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: the tree level of search */ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + ulint mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be PAGE_CUR_LE, + not PAGE_CUR_GE, as the latter may end up on + the previous page of the record! Inserts + should always be made using PAGE_CUR_LE to + search the position! */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with + at most one of BTR_INSERT, BTR_DELETE_MARK, + BTR_DELETE, or BTR_ESTIMATE; + cursor->left_block is used to store a pointer + to the left neighbor page, in the cases + BTR_SEARCH_PREV and BTR_MODIFY_PREV; + NOTE that if has_search_latch + is != 0, we maybe do not have a latch set + on the cursor page, we assume + the caller uses his search latch + to protect the record! */ + btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is + s- or x-latched, but see also above! */ + ulint has_search_latch,/*!< in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, or 0 */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Opens a cursor at either end of an index. */ +UNIV_INTERN +void +btr_cur_open_at_index_side_func( +/*============================*/ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_cur_t* cursor, /*!< in/out: cursor */ + ulint level, /*!< in: level to search for + (0=leaf) */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +#define btr_cur_open_at_index_side(f,i,l,c,lv,m) \ + btr_cur_open_at_index_side_func(f,i,l,c,lv,__FILE__,__LINE__,m) +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. */ +UNIV_INTERN +void +btr_cur_open_at_rnd_pos_func( +/*=========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /*!< in/out: B-tree cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_cur_open_at_rnd_pos(i,l,c,m) \ + btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m) +/*************************************************************//** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. +@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */ +UNIV_INTERN +dberr_t +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr) /*!< in/out: mini-transaction; + if this function returns DB_SUCCESS on + a leaf page of a secondary index in a + compressed tablespace, the caller must + mtr_commit(mtr) before latching + any further pages */ + __attribute__((nonnull(2,3,4,5,6,7,10), warn_unused_result)); +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in: query thread or NULL */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull(2,3,4,5,6,7,10), warn_unused_result)); +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. + +@retval false if out of space; IBUF_BITMAP_FREE will be reset +outside mtr if the page was recompressed +@retval true if enough place; + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is +a secondary index leaf page. This has to be done either within the +same mini-transaction, or by invoking ibuf_reset_free_bits() before +mtr_commit(mtr). */ +UNIV_INTERN +bool +btr_cur_update_alloc_zip_func( +/*==========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + page_cur_t* cursor, /*!< in/out: B-tree page cursor */ + dict_index_t* index, /*!< in: the index corresponding to cursor */ +#ifdef UNIV_DEBUG + ulint* offsets,/*!< in/out: offsets of the cursor record */ +#endif /* UNIV_DEBUG */ + ulint length, /*!< in: size needed */ + bool create, /*!< in: true=delete-and-insert, + false=update-in-place */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \ + btr_cur_update_alloc_zip_func(page_zip,cursor,index,offsets,len,cr,mtr) +#else /* UNIV_DEBUG */ +# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \ + btr_cur_update_alloc_zip_func(page_zip,cursor,index,len,cr,mtr) +#endif /* UNIV_DEBUG */ +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. +@return locking or undo log related error code, or +@retval DB_SUCCESS on success +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +UNIV_INTERN +dberr_t +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + ulint* offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ + __attribute__((warn_unused_result, nonnull)); +/***********************************************************//** +Writes a redo log record of updating a record in-place. */ +UNIV_INTERN +void +btr_cur_update_in_place_log( +/*========================*/ + ulint flags, /*!< in: flags */ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector */ + trx_id_t trx_id, /*!< in: transaction id */ + roll_ptr_t roll_ptr, /*!< in: roll ptr */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. +@return error code, including +@retval DB_SUCCESS on success +@retval DB_OVERFLOW if the updated record does not fit +@retval DB_UNDERFLOW if the page would become too empty +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page */ +UNIV_INTERN +dberr_t +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ + __attribute__((warn_unused_result, nonnull)); +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +btr_cur_pessimistic_update( +/*=======================*/ + ulint flags, /*!< in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ + ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied, or NULL */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller, or NULL */ + const upd_t* update, /*!< in: update vector; this is allowed also + contain trx id and roll ptr fields, but + the values in update vector have no effect */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; must be committed + before latching any further pages */ + __attribute__((warn_unused_result, nonnull)); +/***********************************************************//** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field pointer to the +undo log record created. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INTERN +dberr_t +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + buf_block_t* block, /*!< in/out: buffer block of the record */ + rec_t* rec, /*!< in/out: record */ + dict_index_t* index, /*!< in: clustered index of the record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************//** +Sets a secondary index record delete mark to TRUE or FALSE. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INTERN +dberr_t +btr_cur_del_mark_set_sec_rec( +/*=========================*/ + ulint flags, /*!< in: locking flag */ + btr_cur_t* cursor, /*!< in: cursor */ + ibool val, /*!< in: value to set */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! +@return TRUE if compression occurred */ +UNIV_INTERN +ibool +btr_cur_compress_if_useful( +/*=======================*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if compression + occurs */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/*******************************************************//** +Removes the record on which the tree cursor is positioned. It is assumed +that the mtr has an x-latch on the page where the cursor is positioned, +but no latch on the whole tree. +@return TRUE if success, i.e., the page did not become too empty */ +UNIV_INTERN +ibool +btr_cur_optimistic_delete_func( +/*===========================*/ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + cursor stays valid: if deletion succeeds, + on function exit it points to the successor + of the deleted record */ +# ifdef UNIV_DEBUG + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ +# endif /* UNIV_DEBUG */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ + __attribute__((nonnull, warn_unused_result)); +# ifdef UNIV_DEBUG +# define btr_cur_optimistic_delete(cursor, flags, mtr) \ + btr_cur_optimistic_delete_func(cursor, flags, mtr) +# else /* UNIV_DEBUG */ +# define btr_cur_optimistic_delete(cursor, flags, mtr) \ + btr_cur_optimistic_delete_func(cursor, mtr) +# endif /* UNIV_DEBUG */ +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. +@return TRUE if compression occurred */ +UNIV_INTERN +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /*!< in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Parses a redo log record of updating a record in-place. +@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_cur_parse_update_in_place( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in/out: page or NULL */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index); /*!< in: index corresponding to page */ +/****************************************************************//** +Parses the redo log record for delete marking or unmarking of a clustered +index record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_cur_parse_del_mark_set_clust_rec( +/*=================================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in/out: page or NULL */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + dict_index_t* index); /*!< in: index corresponding to page */ +/****************************************************************//** +Parses the redo log record for delete marking or unmarking of a secondary +index record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +btr_cur_parse_del_mark_set_sec_rec( +/*===============================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in/out: page or NULL */ + page_zip_des_t* page_zip);/*!< in/out: compressed page, or NULL */ +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Estimates the number of rows in a given index range. +@return estimated number of rows */ +UNIV_INTERN +ib_int64_t +btr_estimate_n_rows_in_range( +/*=========================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple1, /*!< in: range start, may also be empty tuple */ + ulint mode1, /*!< in: search mode for range start */ + const dtuple_t* tuple2, /*!< in: range end, may also be empty tuple */ + ulint mode2); /*!< in: search mode for range end */ +/*******************************************************************//** +Estimates the number of different key values in a given index, for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed +0..n_uniq-1) and the number of pages that were sampled is saved in +index->stat_n_sample_sizes[]. +If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and stored the estimates in +array index->stat_n_non_null_key_vals. */ +UNIV_INTERN +void +btr_estimate_number_of_different_key_vals( +/*======================================*/ + dict_index_t* index); /*!< in: index */ + +/** Gets the externally stored size of a record, in units of a database page. +@param[in] rec record +@param[in] offsets array returned by rec_get_offsets() +@return externally stored part, in units of a database page */ + +ulint +btr_rec_get_externally_stored_len( + const rec_t* rec, + const ulint* offsets); + +/*******************************************************************//** +Marks non-updated off-page fields as disowned by this record. The ownership +must be transferred to the updated record which is inserted elsewhere in the +index tree. In purge only the owner of externally stored field is allowed +to free the field. */ +UNIV_INTERN +void +btr_cur_disown_inherited_fields( +/*============================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull(2,3,4,5,6))); + +/** Operation code for btr_store_big_rec_extern_fields(). */ +enum blob_op { + /** Store off-page columns for a freshly inserted record */ + BTR_STORE_INSERT = 0, + /** Store off-page columns for an insert by update */ + BTR_STORE_INSERT_UPDATE, + /** Store off-page columns for an update */ + BTR_STORE_UPDATE +}; + +/*******************************************************************//** +Determine if an operation on off-page columns is an update. +@return TRUE if op != BTR_STORE_INSERT */ +UNIV_INLINE +ibool +btr_blob_op_is_update( +/*==================*/ + enum blob_op op) /*!< in: operation */ + __attribute__((warn_unused_result)); + +/*******************************************************************//** +Stores the fields in big_rec_vec to the tablespace and puts pointers to +them in rec. The extern flags in rec will have to be set beforehand. +The fields are stored on pages allocated from leaf node +file segment of the index tree. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +UNIV_INTERN +dberr_t +btr_store_big_rec_extern_fields( +/*============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree + MUST be X-latched */ + buf_block_t* rec_block, /*!< in/out: block containing rec */ + rec_t* rec, /*!< in/out: record */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index); + the "external storage" flags in offsets + will not correspond to rec when + this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ + mtr_t* btr_mtr, /*!< in: mtr containing the + latches to the clustered index */ + enum blob_op op) /*! in: operation code */ + __attribute__((nonnull, warn_unused_result)); + +/*******************************************************************//** +Frees the space in an externally stored field to the file space +management if the field in data is owned the externally stored field, +in a rollback we may have the additional condition that the field must +not be inherited. */ +UNIV_INTERN +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched; if the tree + height is 1, then also the root page + must be X-latched! (this is relevant + in the case this function is called + from purge where 'data' is located on + an undo log page, not an index + page) */ + byte* field_ref, /*!< in/out: field reference */ + const rec_t* rec, /*!< in: record containing field_ref, for + page_zip_write_blob_ptr(), or NULL */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index), + or NULL */ + page_zip_des_t* page_zip, /*!< in: compressed page corresponding + to rec, or NULL if rec == NULL */ + ulint i, /*!< in: field number of field_ref; + ignored if rec == NULL */ + enum trx_rb_ctx rb_ctx, /*!< in: rollback context */ + mtr_t* local_mtr); /*!< in: mtr containing the latch to + data an an X-latch to the index + tree */ +/*******************************************************************//** +Copies the prefix of an externally stored field of a record. The +clustered index record must be protected by a lock or a page latch. +@return the length of the copied field, or 0 if the column was being +or has been deleted */ +UNIV_INTERN +ulint +btr_copy_externally_stored_field_prefix( +/*====================================*/ + byte* buf, /*!< out: the field, or a prefix of it */ + ulint len, /*!< in: length of buf, in bytes */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + const byte* data, /*!< in: 'internally' stored part of the + field containing also the reference to + the external part; must be protected by + a lock or a page latch */ + ulint local_len);/*!< in: length of data, in bytes */ +/*******************************************************************//** +Copies an externally stored field of a record to mem heap. The +clustered index record must be protected by a lock or a page latch. +@return the whole field copied to heap */ +UNIV_INTERN +byte* +btr_copy_externally_stored_field( +/*=============================*/ + ulint* len, /*!< out: length of the whole field */ + const byte* data, /*!< in: 'internally' stored part of the + field containing also the reference to + the external part; must be protected by + a lock or a page latch */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint local_len,/*!< in: length of data */ + mem_heap_t* heap); /*!< in: mem heap */ +/*******************************************************************//** +Copies an externally stored field of a record to mem heap. +@return the field copied to heap, or NULL if the field is incomplete */ +UNIV_INTERN +byte* +btr_rec_copy_externally_stored_field( +/*=================================*/ + const rec_t* rec, /*!< in: record in a clustered index; + must be protected by a lock or a page latch */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint zip_size,/*!< in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint no, /*!< in: field number */ + ulint* len, /*!< out: length of the field */ + mem_heap_t* heap); /*!< in: mem heap */ +/*******************************************************************//** +Flags the data tuple fields that are marked as extern storage in the +update vector. We use this function to remember which fields we must +mark as extern storage in a record inserted for an update. +@return number of flagged external columns */ +UNIV_INTERN +ulint +btr_push_update_extern_fields( +/*==========================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const upd_t* update, /*!< in: update vector */ + mem_heap_t* heap) /*!< in: memory heap */ + __attribute__((nonnull)); +/***********************************************************//** +Sets a secondary index record's delete mark to the given value. This +function is only used by the insert buffer merge mechanism. */ +UNIV_INTERN +void +btr_cur_set_deleted_flag_for_ibuf( +/*==============================*/ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip, /*!< in/out: compressed page + corresponding to rec, or NULL + when the tablespace is + uncompressed */ + ibool val, /*!< in: value to set */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/*######################################################################*/ + +/** In the pessimistic delete, if the page data size drops below this +limit, merging it to a neighbor is tried */ +#define BTR_CUR_PAGE_COMPRESS_LIMIT (UNIV_PAGE_SIZE / 2) + +/** A slot in the path array. We store here info on a search path down the +tree. Each slot contains data on a single level of the tree. */ + +struct btr_path_t{ + ulint nth_rec; /*!< index of the record + where the page cursor stopped on + this level (index in alphabetical + order); value ULINT_UNDEFINED + denotes array end */ + ulint n_recs; /*!< number of records on the page */ + ulint page_no; /*!< no of the page containing the record */ + ulint page_level; /*!< level of the page, if later we fetch + the page under page_no and it is no different + level then we know that the tree has been + reorganized */ +}; + +#define BTR_PATH_ARRAY_N_SLOTS 250 /*!< size of path array (in slots) */ + +/** Values for the flag documenting the used search method */ +enum btr_cur_method { + BTR_CUR_HASH = 1, /*!< successful shortcut using + the hash index */ + BTR_CUR_HASH_FAIL, /*!< failure using hash, success using + binary search: the misleading hash + reference is stored in the field + hash_node, and might be necessary to + update */ + BTR_CUR_BINARY, /*!< success using the binary search */ + BTR_CUR_INSERT_TO_IBUF, /*!< performed the intended insert to + the insert buffer */ + BTR_CUR_DEL_MARK_IBUF, /*!< performed the intended delete + mark in the insert/delete buffer */ + BTR_CUR_DELETE_IBUF, /*!< performed the intended delete in + the insert/delete buffer */ + BTR_CUR_DELETE_REF /*!< row_purge_poss_sec() failed */ +}; + +/** The tree cursor: the definition appears here only for the compiler +to know struct size! */ +struct btr_cur_t { + dict_index_t* index; /*!< index where positioned */ + page_cur_t page_cur; /*!< page cursor */ + purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */ + buf_block_t* left_block; /*!< this field is used to store + a pointer to the left neighbor + page, in the cases + BTR_SEARCH_PREV and + BTR_MODIFY_PREV */ + /*------------------------------*/ + que_thr_t* thr; /*!< this field is only used + when btr_cur_search_to_nth_level + is called for an index entry + insertion: the calling query + thread is passed here to be + used in the insert buffer */ + /*------------------------------*/ + /** The following fields are used in + btr_cur_search_to_nth_level to pass information: */ + /* @{ */ + enum btr_cur_method flag; /*!< Search method used */ + ulint tree_height; /*!< Tree height if the search is done + for a pessimistic insert or update + operation */ + ulint up_match; /*!< If the search mode was PAGE_CUR_LE, + the number of matched fields to the + the first user record to the right of + the cursor record after + btr_cur_search_to_nth_level; + for the mode PAGE_CUR_GE, the matched + fields to the first user record AT THE + CURSOR or to the right of it; + NOTE that the up_match and low_match + values may exceed the correct values + for comparison to the adjacent user + record if that record is on a + different leaf page! (See the note in + row_ins_duplicate_error_in_clust.) */ + ulint up_bytes; /*!< number of matched bytes to the + right at the time cursor positioned; + only used internally in searches: not + defined after the search */ + ulint low_match; /*!< if search mode was PAGE_CUR_LE, + the number of matched fields to the + first user record AT THE CURSOR or + to the left of it after + btr_cur_search_to_nth_level; + NOT defined for PAGE_CUR_GE or any + other search modes; see also the NOTE + in up_match! */ + ulint low_bytes; /*!< number of matched bytes to the + right at the time cursor positioned; + only used internally in searches: not + defined after the search */ + ulint n_fields; /*!< prefix length used in a hash + search if hash_node != NULL */ + ulint n_bytes; /*!< hash prefix bytes if hash_node != + NULL */ + ulint fold; /*!< fold value used in the search if + flag is BTR_CUR_HASH */ + /* @} */ + btr_path_t* path_arr; /*!< in estimating the number of + rows in range, we store in this array + information of the path through + the tree */ +}; + +/** If pessimistic delete fails because of lack of file space, there +is still a good change of success a little later. Try this many +times. */ +#define BTR_CUR_RETRY_DELETE_N_TIMES 100 +/** If pessimistic delete fails because of lack of file space, there +is still a good change of success a little later. Sleep this many +microseconds between retries. */ +#define BTR_CUR_RETRY_SLEEP_TIME 50000 + +/** The reference in a field for which data is stored on a different page. +The reference is at the end of the 'locally' stored part of the field. +'Locally' means storage in the index record. +We store locally a long enough prefix of each column so that we can determine +the ordering parts of each index record without looking into the externally +stored part. */ +/*-------------------------------------- @{ */ +#define BTR_EXTERN_SPACE_ID 0 /*!< space id where stored */ +#define BTR_EXTERN_PAGE_NO 4 /*!< page no where stored */ +#define BTR_EXTERN_OFFSET 8 /*!< offset of BLOB header + on that page */ +#define BTR_EXTERN_LEN 12 /*!< 8 bytes containing the + length of the externally + stored part of the BLOB. + The 2 highest bits are + reserved to the flags below. */ +/*-------------------------------------- @} */ +/* #define BTR_EXTERN_FIELD_REF_SIZE 20 // moved to btr0types.h */ + +/** The most significant bit of BTR_EXTERN_LEN (i.e., the most +significant bit of the byte at smallest address) is set to 1 if this +field does not 'own' the externally stored field; only the owner field +is allowed to free the field in purge! */ +#define BTR_EXTERN_OWNER_FLAG 128 +/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the +second most significant bit of the byte at smallest address) is 1 then +it means that the externally stored field was inherited from an +earlier version of the row. In rollback we are not allowed to free an +inherited external field. */ +#define BTR_EXTERN_INHERITED_FLAG 64 + +/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */ +extern ulint btr_cur_n_non_sea; +/** Number of successful adaptive hash index lookups in +btr_cur_search_to_nth_level(). */ +extern ulint btr_cur_n_sea; +/** Old value of btr_cur_n_non_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +extern ulint btr_cur_n_non_sea_old; +/** Old value of btr_cur_n_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +extern ulint btr_cur_n_sea_old; +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +extern uint btr_cur_limit_optimistic_insert_debug; +#endif /* UNIV_DEBUG */ + +#ifndef UNIV_NONINL +#include "btr0cur.ic" +#endif + +#endif diff --git a/storage/innobase/include/btr0cur.ic b/storage/innobase/include/btr0cur.ic new file mode 100644 index 00000000000..43ee3304c0e --- /dev/null +++ b/storage/innobase/include/btr0cur.ic @@ -0,0 +1,223 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0cur.ic +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_HOTBACKUP +#include "btr0btr.h" + +#ifdef UNIV_DEBUG +# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\ +if (btr_cur_limit_optimistic_insert_debug > 1\ + && (NREC) >= (ulint)btr_cur_limit_optimistic_insert_debug) {\ + CODE;\ +} +#else +# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE) +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the page cursor component of a tree cursor. +@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_cur_get_page_cur( +/*=================*/ + const btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(&((btr_cur_t*) cursor)->page_cur); +} + +/*********************************************************//** +Returns the buffer block on which the tree cursor is positioned. +@return pointer to buffer block */ +UNIV_INLINE +buf_block_t* +btr_cur_get_block( +/*==============*/ + const btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_cur_get_block(btr_cur_get_page_cur(cursor))); +} + +/*********************************************************//** +Returns the record pointer of a tree cursor. +@return pointer to record */ +UNIV_INLINE +rec_t* +btr_cur_get_rec( +/*============*/ + const btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_cur_get_rec(btr_cur_get_page_cur(cursor))); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************//** +Returns the compressed page on which the tree cursor is positioned. +@return pointer to compressed page, or NULL if the page is not compressed */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(buf_block_get_page_zip(btr_cur_get_block(cursor))); +} + +/*********************************************************//** +Invalidates a tree cursor by setting record pointer to NULL. */ +UNIV_INLINE +void +btr_cur_invalidate( +/*===============*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + page_cur_invalidate(&(cursor->page_cur)); +} + +/*********************************************************//** +Returns the page of a tree cursor. +@return pointer to page */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_align(page_cur_get_rec(&(cursor->page_cur)))); +} + +/*********************************************************//** +Positions a tree cursor at a given record. */ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record in tree */ + buf_block_t* block, /*!< in: buffer block of rec */ + btr_cur_t* cursor) /*!< out: cursor */ +{ + ut_ad(page_align(rec) == block->frame); + + page_cur_position(rec, block, btr_cur_get_page_cur(cursor)); + + cursor->index = index; +} + +/*********************************************************************//** +Checks if compressing an index page where a btr cursor is placed makes +sense. +@return TRUE if compression is recommended */ +UNIV_INLINE +ibool +btr_cur_compress_recommendation( +/*============================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page; + + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2, + return(FALSE)); + + if ((page_get_data_size(page) < BTR_CUR_PAGE_COMPRESS_LIMIT) + || ((btr_page_get_next(page, mtr) == FIL_NULL) + && (btr_page_get_prev(page, mtr) == FIL_NULL))) { + + /* The page fillfactor has dropped below a predefined + minimum value OR the level in the B-tree contains just + one page: we recommend compression if this is not the + root page. */ + + return(dict_index_get_page(cursor->index) + != page_get_page_no(page)); + } + + return(FALSE); +} + +/*********************************************************************//** +Checks if the record on which the cursor is placed can be deleted without +making tree compression necessary (or, recommended). +@return TRUE if can be deleted without recommended compression */ +UNIV_INLINE +ibool +btr_cur_can_delete_without_compress( +/*================================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + ulint rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + if ((page_get_data_size(page) - rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT) + || ((btr_page_get_next(page, mtr) == FIL_NULL) + && (btr_page_get_prev(page, mtr) == FIL_NULL)) + || (page_get_n_recs(page) < 2)) { + + /* The page fillfactor will drop below a predefined + minimum value, OR the level in the B-tree contains just + one page, OR the page will become empty: we recommend + compression if this is not the root page. */ + + return(dict_index_get_page(cursor->index) + == page_get_page_no(page)); + } + + return(TRUE); +} + +/*******************************************************************//** +Determine if an operation on off-page columns is an update. +@return TRUE if op != BTR_STORE_INSERT */ +UNIV_INLINE +ibool +btr_blob_op_is_update( +/*==================*/ + enum blob_op op) /*!< in: operation */ +{ + switch (op) { + case BTR_STORE_INSERT: + return(FALSE); + case BTR_STORE_INSERT_UPDATE: + case BTR_STORE_UPDATE: + return(TRUE); + } + + ut_ad(0); + return(FALSE); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h new file mode 100644 index 00000000000..cfbaacf4de3 --- /dev/null +++ b/storage/innobase/include/btr0pcur.h @@ -0,0 +1,548 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0pcur.h +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + +#ifndef btr0pcur_h +#define btr0pcur_h + +#include "univ.i" +#include "dict0dict.h" +#include "data0data.h" +#include "mtr0mtr.h" +#include "page0cur.h" +#include "btr0cur.h" +#include "btr0btr.h" +#include "btr0types.h" + +/* Relative positions for a stored cursor position */ +#define BTR_PCUR_ON 1 +#define BTR_PCUR_BEFORE 2 +#define BTR_PCUR_AFTER 3 +/* Note that if the tree is not empty, btr_pcur_store_position does not +use the following, but only uses the above three alternatives, where the +position is stored relative to a specific record: this makes implementation +of a scroll cursor easier */ +#define BTR_PCUR_BEFORE_FIRST_IN_TREE 4 /* in an empty tree */ +#define BTR_PCUR_AFTER_LAST_IN_TREE 5 /* in an empty tree */ + +/**************************************************************//** +Allocates memory for a persistent cursor object and initializes the cursor. +@return own: persistent cursor */ +UNIV_INTERN +btr_pcur_t* +btr_pcur_create_for_mysql(void); +/*============================*/ + +/**************************************************************//** +Resets a persistent cursor object, freeing ::old_rec_buf if it is +allocated and resetting the other members to their initial values. */ +UNIV_INTERN +void +btr_pcur_reset( +/*===========*/ + btr_pcur_t* cursor);/*!< in, out: persistent cursor */ + +/**************************************************************//** +Frees the memory for a persistent cursor object. */ +UNIV_INTERN +void +btr_pcur_free_for_mysql( +/*====================*/ + btr_pcur_t* cursor); /*!< in, own: persistent cursor */ +/**************************************************************//** +Copies the stored position of a pcur to another pcur. */ +UNIV_INTERN +void +btr_pcur_copy_stored_position( +/*==========================*/ + btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the + position info */ + btr_pcur_t* pcur_donate); /*!< in: pcur from which the info is + copied */ +/**************************************************************//** +Sets the old_rec_buf field to NULL. */ +UNIV_INLINE +void +btr_pcur_init( +/*==========*/ + btr_pcur_t* pcur); /*!< in: persistent cursor */ +/**************************************************************//** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +UNIV_INLINE +void +btr_pcur_open_low( +/*==============*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level in the btree */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + ulint mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page from the + record! */ + ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_pcur_open(i,t,md,l,c,m) \ + btr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,m) +/**************************************************************//** +Opens an persistent cursor to an index tree without initializing the +cursor. */ +UNIV_INLINE +void +btr_pcur_open_with_no_init_func( +/*============================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + ulint mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page of the + record! */ + ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...; + NOTE that if has_search_latch != 0 then + we maybe do not acquire a latch on the cursor + page, but assume that the caller uses his + btr search latch to protect the record! */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + ulint has_search_latch,/*!< in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, or 0 */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_pcur_open_with_no_init(ix,t,md,l,cur,has,m) \ + btr_pcur_open_with_no_init_func(ix,t,md,l,cur,has,__FILE__,__LINE__,m) + +/*****************************************************************//** +Opens a persistent cursor at either end of an index. */ +UNIV_INLINE +void +btr_pcur_open_at_index_side( +/*========================*/ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_pcur_t* pcur, /*!< in/out: cursor */ + bool init_pcur, /*!< in: whether to initialize pcur */ + ulint level, /*!< in: level to search for + (0=leaf) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/**************************************************************//** +Gets the up_match value for a pcur after a search. +@return number of matched fields at the cursor or to the right if +search mode was PAGE_CUR_GE, otherwise undefined */ +UNIV_INLINE +ulint +btr_pcur_get_up_match( +/*==================*/ + const btr_pcur_t* cursor); /*!< in: persistent cursor */ +/**************************************************************//** +Gets the low_match value for a pcur after a search. +@return number of matched fields at the cursor or to the right if +search mode was PAGE_CUR_LE, otherwise undefined */ +UNIV_INLINE +ulint +btr_pcur_get_low_match( +/*===================*/ + const btr_pcur_t* cursor); /*!< in: persistent cursor */ +/**************************************************************//** +If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first +user record satisfying the search condition, in the case PAGE_CUR_L or +PAGE_CUR_LE, on the last user record. If no such user record exists, then +in the first case sets the cursor after last in tree, and in the latter case +before first in tree. The latching mode must be BTR_SEARCH_LEAF or +BTR_MODIFY_LEAF. */ +UNIV_INTERN +void +btr_pcur_open_on_user_rec_func( +/*===========================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + ulint mode, /*!< in: PAGE_CUR_L, ... */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent + cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_pcur_open_on_user_rec(i,t,md,l,c,m) \ + btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m) +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. */ +UNIV_INLINE +void +btr_pcur_open_at_rnd_pos_func( +/*==========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in/out: B-tree pcur */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_pcur_open_at_rnd_pos(i,l,c,m) \ + btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m) +/**************************************************************//** +Frees the possible memory heap of a persistent cursor and sets the latch +mode of the persistent cursor to BTR_NO_LATCHES. +WARNING: this function does not release the latch on the page where the +cursor is currently positioned. The latch is acquired by the +"move to next/previous" family of functions. Since recursive shared locks +are not allowed, you must take care (if using the cursor in S-mode) to +manually release the latch by either calling +btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr) +or by committing the mini-transaction right after btr_pcur_close(). +A subsequent attempt to crawl the same page in the same mtr would cause +an assertion failure. */ +UNIV_INLINE +void +btr_pcur_close( +/*===========*/ + btr_pcur_t* cursor); /*!< in: persistent cursor */ +/**************************************************************//** +The position of the cursor is stored by taking an initial segment of the +record the cursor is positioned on, before, or after, and copying it to the +cursor data structure, or just setting a flag if the cursor id before the +first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the +page where the cursor is positioned must not be empty if the index tree is +not totally empty! */ +UNIV_INTERN +void +btr_pcur_store_position( +/*====================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr); /*!< in: mtr */ +/**************************************************************//** +Restores the stored position of a persistent cursor bufferfixing the page and +obtaining the specified latches. If the cursor position was saved when the +(1) cursor was positioned on a user record: this function restores the position +to the last record LESS OR EQUAL to the stored record; +(2) cursor was positioned on a page infimum record: restores the position to +the last record LESS than the user record which was the successor of the page +infimum; +(3) cursor was positioned on the page supremum: restores to the first record +GREATER than the user record which was the predecessor of the supremum. +(4) cursor was positioned before the first or after the last in an empty tree: +restores to before first or after the last in the tree. +@return TRUE if the cursor position was stored when it was on a user +record and it can be restored on a user record whose ordering fields +are identical to the ones of the original user record */ +UNIV_INTERN +ibool +btr_pcur_restore_position_func( +/*===========================*/ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in: detached persistent cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mtr */ +#define btr_pcur_restore_position(l,cur,mtr) \ + btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr) +/*********************************************************//** +Gets the rel_pos field for a cursor whose position has been stored. +@return BTR_PCUR_ON, ... */ +UNIV_INLINE +ulint +btr_pcur_get_rel_pos( +/*=================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/**************************************************************//** +Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES, +that is, the cursor becomes detached. +Function btr_pcur_store_position should be used before calling this, +if restoration of cursor is wanted later. */ +UNIV_INLINE +void +btr_pcur_commit_specify_mtr( +/*========================*/ + btr_pcur_t* pcur, /*!< in: persistent cursor */ + mtr_t* mtr); /*!< in: mtr to commit */ +/*********************************************************//** +Moves the persistent cursor to the next record in the tree. If no records are +left, the cursor stays 'after last in tree'. +@return TRUE if the cursor was not after last in tree */ +UNIV_INLINE +ibool +btr_pcur_move_to_next( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. +@return TRUE if the cursor was not before first in tree */ +UNIV_INTERN +ibool +btr_pcur_move_to_prev( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the last record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_last_on_page( +/*==========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the next user record in the tree. If no user +records are left, the cursor ends up 'after last in tree'. +@return TRUE if the cursor moved forward, ending on a user record */ +UNIV_INLINE +ibool +btr_pcur_move_to_next_user_rec( +/*===========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the first record on the next page. +Releases the latch on the current page, and bufferunfixes it. +Note that there must not be modifications on the current page, +as then the x-latch can be released only in mtr_commit. */ +UNIV_INTERN +void +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor backward if it is on the first record +of the page. Releases the latch on the current page, and bufferunfixes +it. Note that to prevent a possible deadlock, the operation first +stores the position of the cursor, releases the leaf latch, acquires +necessary latches and restores the cursor position again before returning. +The alphabetical position of the cursor is guaranteed to be sensible +on return, but it may happen that the cursor is not positioned on the +last record of any page, because the structure of the tree may have +changed while the cursor had no latches. */ +UNIV_INTERN +void +btr_pcur_move_backward_from_page( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the + first record of the current page */ + mtr_t* mtr); /*!< in: mtr */ +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the btr cursor component of a persistent cursor. +@return pointer to btr cursor component */ +UNIV_INLINE +btr_cur_t* +btr_pcur_get_btr_cur( +/*=================*/ + const btr_pcur_t* cursor); /*!< in: persistent cursor */ +/*********************************************************//** +Returns the page cursor component of a persistent cursor. +@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_pcur_get_page_cur( +/*==================*/ + const btr_pcur_t* cursor); /*!< in: persistent cursor */ +/*********************************************************//** +Returns the page of a persistent cursor. +@return pointer to the page */ +UNIV_INLINE +page_t* +btr_pcur_get_page( +/*==============*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Returns the buffer block of a persistent cursor. +@return pointer to the block */ +UNIV_INLINE +buf_block_t* +btr_pcur_get_block( +/*===============*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Returns the record of a persistent cursor. +@return pointer to the record */ +UNIV_INLINE +rec_t* +btr_pcur_get_rec( +/*=============*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +#else /* UNIV_DEBUG */ +# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur) +# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur) +# define btr_pcur_get_page(cursor) ((cursor)->btr_cur.page_cur.block->frame) +# define btr_pcur_get_block(cursor) ((cursor)->btr_cur.page_cur.block) +# define btr_pcur_get_rec(cursor) ((cursor)->btr_cur.page_cur.rec) +#endif /* UNIV_DEBUG */ +/*********************************************************//** +Checks if the persistent cursor is on a user record. */ +UNIV_INLINE +ibool +btr_pcur_is_on_user_rec( +/*====================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Checks if the persistent cursor is after the last user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_on_page( +/*===========================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Checks if the persistent cursor is before the first user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_on_page( +/*=============================*/ + const btr_pcur_t* cursor);/*!< in: persistent cursor */ +/*********************************************************//** +Checks if the persistent cursor is before the first user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_in_tree( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Checks if the persistent cursor is after the last user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_in_tree( +/*===========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************//** +Moves the persistent cursor to the next record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_next_on_page( +/*==========================*/ + btr_pcur_t* cursor);/*!< in/out: persistent cursor */ +/*********************************************************//** +Moves the persistent cursor to the previous record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_prev_on_page( +/*==========================*/ + btr_pcur_t* cursor);/*!< in/out: persistent cursor */ +/*********************************************************//** +Moves the persistent cursor to the infimum record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_before_first_on_page( +/*===============================*/ + btr_pcur_t* cursor); /*!< in/out: persistent cursor */ + +/** Position state of persistent B-tree cursor. */ +enum pcur_pos_t { + /** The persistent cursor is not positioned. */ + BTR_PCUR_NOT_POSITIONED = 0, + /** The persistent cursor was previously positioned. + TODO: currently, the state can be BTR_PCUR_IS_POSITIONED, + though it really should be BTR_PCUR_WAS_POSITIONED, + because we have no obligation to commit the cursor with + mtr; similarly latch_mode may be out of date. This can + lead to problems if btr_pcur is not used the right way; + all current code should be ok. */ + BTR_PCUR_WAS_POSITIONED, + /** The persistent cursor is positioned by optimistic get to the same + record as it was positioned at. Not used for rel_pos == BTR_PCUR_ON. + It may need adjustment depending on previous/current search direction + and rel_pos. */ + BTR_PCUR_IS_POSITIONED_OPTIMISTIC, + /** The persistent cursor is positioned by index search. + Or optimistic get for rel_pos == BTR_PCUR_ON. */ + BTR_PCUR_IS_POSITIONED +}; + +/* The persistent B-tree cursor structure. This is used mainly for SQL +selects, updates, and deletes. */ + +struct btr_pcur_t{ + btr_cur_t btr_cur; /*!< a B-tree cursor */ + ulint latch_mode; /*!< see TODO note below! + BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, + BTR_MODIFY_TREE, or BTR_NO_LATCHES, + depending on the latching state of + the page and tree where the cursor is + positioned; BTR_NO_LATCHES means that + the cursor is not currently positioned: + we say then that the cursor is + detached; it can be restored to + attached if the old position was + stored in old_rec */ + ulint old_stored; /*!< BTR_PCUR_OLD_STORED + or BTR_PCUR_OLD_NOT_STORED */ + rec_t* old_rec; /*!< if cursor position is stored, + contains an initial segment of the + latest record cursor was positioned + either on, before, or after */ + ulint old_n_fields; /*!< number of fields in old_rec */ + ulint rel_pos; /*!< BTR_PCUR_ON, BTR_PCUR_BEFORE, or + BTR_PCUR_AFTER, depending on whether + cursor was on, before, or after the + old_rec record */ + buf_block_t* block_when_stored;/* buffer block when the position was + stored */ + ib_uint64_t modify_clock; /*!< the modify clock value of the + buffer block when the cursor position + was stored */ + enum pcur_pos_t pos_state; /*!< btr_pcur_store_position() and + btr_pcur_restore_position() state. */ + ulint search_mode; /*!< PAGE_CUR_G, ... */ + trx_t* trx_if_known; /*!< the transaction, if we know it; + otherwise this field is not defined; + can ONLY BE USED in error prints in + fatal assertion failures! */ + /*-----------------------------*/ + /* NOTE that the following fields may possess dynamically allocated + memory which should be freed if not needed anymore! */ + + byte* old_rec_buf; /*!< NULL, or a dynamically allocated + buffer for old_rec */ + ulint buf_size; /*!< old_rec_buf size if old_rec_buf + is not NULL */ +}; + +#define BTR_PCUR_OLD_STORED 908467085 +#define BTR_PCUR_OLD_NOT_STORED 122766467 + +#ifndef UNIV_NONINL +#include "btr0pcur.ic" +#endif + +#endif diff --git a/storage/innobase/include/btr0pcur.ic b/storage/innobase/include/btr0pcur.ic new file mode 100644 index 00000000000..7e355d3709d --- /dev/null +++ b/storage/innobase/include/btr0pcur.ic @@ -0,0 +1,606 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0pcur.ic +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + + +/*********************************************************//** +Gets the rel_pos field for a cursor whose position has been stored. +@return BTR_PCUR_ON, ... */ +UNIV_INLINE +ulint +btr_pcur_get_rel_pos( +/*=================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor); + ut_ad(cursor->old_rec); + ut_ad(cursor->old_stored == BTR_PCUR_OLD_STORED); + ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED + || cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(cursor->rel_pos); +} + +#ifdef UNIV_DEBUG +/*********************************************************//** +Returns the btr cursor component of a persistent cursor. +@return pointer to btr cursor component */ +UNIV_INLINE +btr_cur_t* +btr_pcur_get_btr_cur( +/*=================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + const btr_cur_t* btr_cur = &cursor->btr_cur; + return((btr_cur_t*) btr_cur); +} + +/*********************************************************//** +Returns the page cursor component of a persistent cursor. +@return pointer to page cursor component */ +UNIV_INLINE +page_cur_t* +btr_pcur_get_page_cur( +/*==================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + return(btr_cur_get_page_cur(btr_pcur_get_btr_cur(cursor))); +} + +/*********************************************************//** +Returns the page of a persistent cursor. +@return pointer to the page */ +UNIV_INLINE +page_t* +btr_pcur_get_page( +/*==============*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(btr_cur_get_page(btr_pcur_get_btr_cur(cursor))); +} + +/*********************************************************//** +Returns the buffer block of a persistent cursor. +@return pointer to the block */ +UNIV_INLINE +buf_block_t* +btr_pcur_get_block( +/*===============*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(btr_cur_get_block(btr_pcur_get_btr_cur(cursor))); +} + +/*********************************************************//** +Returns the record of a persistent cursor. +@return pointer to the record */ +UNIV_INLINE +rec_t* +btr_pcur_get_rec( +/*=============*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + return(btr_cur_get_rec(btr_pcur_get_btr_cur(cursor))); +} +#endif /* UNIV_DEBUG */ + +/**************************************************************//** +Gets the up_match value for a pcur after a search. +@return number of matched fields at the cursor or to the right if +search mode was PAGE_CUR_GE, otherwise undefined */ +UNIV_INLINE +ulint +btr_pcur_get_up_match( +/*==================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + const btr_cur_t* btr_cursor; + + ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED) + || (cursor->pos_state == BTR_PCUR_IS_POSITIONED)); + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + ut_ad(btr_cursor->up_match != ULINT_UNDEFINED); + + return(btr_cursor->up_match); +} + +/**************************************************************//** +Gets the low_match value for a pcur after a search. +@return number of matched fields at the cursor or to the right if +search mode was PAGE_CUR_LE, otherwise undefined */ +UNIV_INLINE +ulint +btr_pcur_get_low_match( +/*===================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + const btr_cur_t* btr_cursor; + + ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED) + || (cursor->pos_state == BTR_PCUR_IS_POSITIONED)); + + btr_cursor = btr_pcur_get_btr_cur(cursor); + ut_ad(btr_cursor->low_match != ULINT_UNDEFINED); + + return(btr_cursor->low_match); +} + +/*********************************************************//** +Checks if the persistent cursor is after the last user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_on_page( +/*===========================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor))); +} + +/*********************************************************//** +Checks if the persistent cursor is before the first user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_on_page( +/*=============================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor))); +} + +/*********************************************************//** +Checks if the persistent cursor is on a user record. */ +UNIV_INLINE +ibool +btr_pcur_is_on_user_rec( +/*====================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + if (btr_pcur_is_before_first_on_page(cursor) + || btr_pcur_is_after_last_on_page(cursor)) { + + return(FALSE); + } + + return(TRUE); +} + +/*********************************************************//** +Checks if the persistent cursor is before the first user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_in_tree( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + if (btr_page_get_prev(btr_pcur_get_page(cursor), mtr) != FIL_NULL) { + + return(FALSE); + } + + return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor))); +} + +/*********************************************************//** +Checks if the persistent cursor is after the last user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_in_tree( +/*===========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + if (btr_page_get_next(btr_pcur_get_page(cursor), mtr) != FIL_NULL) { + + return(FALSE); + } + + return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor))); +} + +/*********************************************************//** +Moves the persistent cursor to the next record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_next_on_page( +/*==========================*/ + btr_pcur_t* cursor) /*!< in/out: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_move_to_next(btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/*********************************************************//** +Moves the persistent cursor to the previous record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_prev_on_page( +/*==========================*/ + btr_pcur_t* cursor) /*!< in/out: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_move_to_prev(btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/*********************************************************//** +Moves the persistent cursor to the last record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_last_on_page( +/*==========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + UT_NOT_USED(mtr); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_set_after_last(btr_pcur_get_block(cursor), + btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/*********************************************************//** +Moves the persistent cursor to the next user record in the tree. If no user +records are left, the cursor ends up 'after last in tree'. +@return TRUE if the cursor moved forward, ending on a user record */ +UNIV_INLINE +ibool +btr_pcur_move_to_next_user_rec( +/*===========================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +loop: + if (btr_pcur_is_after_last_on_page(cursor)) { + + if (btr_pcur_is_after_last_in_tree(cursor, mtr)) { + + return(FALSE); + } + + btr_pcur_move_to_next_page(cursor, mtr); + } else { + btr_pcur_move_to_next_on_page(cursor); + } + + if (btr_pcur_is_on_user_rec(cursor)) { + + return(TRUE); + } + + goto loop; +} + +/*********************************************************//** +Moves the persistent cursor to the next record in the tree. If no records are +left, the cursor stays 'after last in tree'. +@return TRUE if the cursor was not after last in tree */ +UNIV_INLINE +ibool +btr_pcur_move_to_next( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + if (btr_pcur_is_after_last_on_page(cursor)) { + + if (btr_pcur_is_after_last_in_tree(cursor, mtr)) { + + return(FALSE); + } + + btr_pcur_move_to_next_page(cursor, mtr); + + return(TRUE); + } + + btr_pcur_move_to_next_on_page(cursor); + + return(TRUE); +} + +/**************************************************************//** +Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES, +that is, the cursor becomes detached. +Function btr_pcur_store_position should be used before calling this, +if restoration of cursor is wanted later. */ +UNIV_INLINE +void +btr_pcur_commit_specify_mtr( +/*========================*/ + btr_pcur_t* pcur, /*!< in: persistent cursor */ + mtr_t* mtr) /*!< in: mtr to commit */ +{ + ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED); + + pcur->latch_mode = BTR_NO_LATCHES; + + mtr_commit(mtr); + + pcur->pos_state = BTR_PCUR_WAS_POSITIONED; +} + +/**************************************************************//** +Sets the old_rec_buf field to NULL. */ +UNIV_INLINE +void +btr_pcur_init( +/*==========*/ + btr_pcur_t* pcur) /*!< in: persistent cursor */ +{ + pcur->old_stored = BTR_PCUR_OLD_NOT_STORED; + pcur->old_rec_buf = NULL; + pcur->old_rec = NULL; +} + +/**************************************************************//** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +UNIV_INLINE +void +btr_pcur_open_low( +/*==============*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level in the btree */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + ulint mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page from the + record! */ + ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_cur_t* btr_cursor; + + /* Initialize the cursor */ + + btr_pcur_init(cursor); + + cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + cursor->search_mode = mode; + + /* Search with the tree cursor */ + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + btr_cur_search_to_nth_level(index, level, tuple, mode, latch_mode, + btr_cursor, 0, file, line, mtr); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + cursor->trx_if_known = NULL; +} + +/**************************************************************//** +Opens an persistent cursor to an index tree without initializing the +cursor. */ +UNIV_INLINE +void +btr_pcur_open_with_no_init_func( +/*============================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* tuple, /*!< in: tuple on which search done */ + ulint mode, /*!< in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page of the + record! */ + ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...; + NOTE that if has_search_latch != 0 then + we maybe do not acquire a latch on the cursor + page, but assume that the caller uses his + btr search latch to protect the record! */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + ulint has_search_latch,/*!< in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, or 0 */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + btr_cur_t* btr_cursor; + + cursor->latch_mode = latch_mode; + cursor->search_mode = mode; + + /* Search with the tree cursor */ + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode, + btr_cursor, has_search_latch, + file, line, mtr); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + cursor->trx_if_known = NULL; +} + +/*****************************************************************//** +Opens a persistent cursor at either end of an index. */ +UNIV_INLINE +void +btr_pcur_open_at_index_side( +/*========================*/ + bool from_left, /*!< in: true if open to the low end, + false if to the high end */ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: latch mode */ + btr_pcur_t* pcur, /*!< in/out: cursor */ + bool init_pcur, /*!< in: whether to initialize pcur */ + ulint level, /*!< in: level to search for + (0=leaf) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + pcur->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + pcur->search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L; + + if (init_pcur) { + btr_pcur_init(pcur); + } + + btr_cur_open_at_index_side(from_left, index, latch_mode, + btr_pcur_get_btr_cur(pcur), level, mtr); + pcur->pos_state = BTR_PCUR_IS_POSITIONED; + + pcur->old_stored = BTR_PCUR_OLD_NOT_STORED; + + pcur->trx_if_known = NULL; +} + +/**********************************************************************//** +Positions a cursor at a randomly chosen position within a B-tree. */ +UNIV_INLINE +void +btr_pcur_open_at_rnd_pos_func( +/*==========================*/ + dict_index_t* index, /*!< in: index */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /*!< in/out: B-tree pcur */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr */ +{ + /* Initialize the cursor */ + + cursor->latch_mode = latch_mode; + cursor->search_mode = PAGE_CUR_G; + + btr_pcur_init(cursor); + + btr_cur_open_at_rnd_pos_func(index, latch_mode, + btr_pcur_get_btr_cur(cursor), + file, line, mtr); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + cursor->trx_if_known = NULL; +} + +/**************************************************************//** +Frees the possible memory heap of a persistent cursor and sets the latch +mode of the persistent cursor to BTR_NO_LATCHES. +WARNING: this function does not release the latch on the page where the +cursor is currently positioned. The latch is acquired by the +"move to next/previous" family of functions. Since recursive shared locks +are not allowed, you must take care (if using the cursor in S-mode) to +manually release the latch by either calling +btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr) +or by committing the mini-transaction right after btr_pcur_close(). +A subsequent attempt to crawl the same page in the same mtr would cause +an assertion failure. */ +UNIV_INLINE +void +btr_pcur_close( +/*===========*/ + btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + if (cursor->old_rec_buf != NULL) { + + mem_free(cursor->old_rec_buf); + + cursor->old_rec = NULL; + cursor->old_rec_buf = NULL; + } + + cursor->btr_cur.page_cur.rec = NULL; + cursor->btr_cur.page_cur.block = NULL; + cursor->old_rec = NULL; + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + cursor->latch_mode = BTR_NO_LATCHES; + cursor->pos_state = BTR_PCUR_NOT_POSITIONED; + + cursor->trx_if_known = NULL; +} + +/*********************************************************//** +Moves the persistent cursor to the infimum record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_before_first_on_page( +/*===============================*/ + btr_pcur_t* cursor) /*!< in/out: persistent cursor */ +{ + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_set_before_first(btr_pcur_get_block(cursor), + btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h new file mode 100644 index 00000000000..848bde451a0 --- /dev/null +++ b/storage/innobase/include/btr0sea.h @@ -0,0 +1,288 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0sea.h +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#ifndef btr0sea_h +#define btr0sea_h + +#include "univ.i" + +#include "rem0rec.h" +#include "dict0dict.h" +#include "btr0types.h" +#include "mtr0mtr.h" +#include "ha0ha.h" + +/*****************************************************************//** +Creates and initializes the adaptive search system at a database start. */ +UNIV_INTERN +void +btr_search_sys_create( +/*==================*/ + ulint hash_size); /*!< in: hash index hash table size */ +/*****************************************************************//** +Frees the adaptive search system at a database shutdown. */ +UNIV_INTERN +void +btr_search_sys_free(void); +/*=====================*/ + +/********************************************************************//** +Disable the adaptive hash search system and empty the index. */ +UNIV_INTERN +void +btr_search_disable(void); +/*====================*/ +/********************************************************************//** +Enable the adaptive hash search system. */ +UNIV_INTERN +void +btr_search_enable(void); +/*====================*/ + +/********************************************************************//** +Returns search info for an index. +@return search info; search mutex reserved */ +UNIV_INLINE +btr_search_t* +btr_search_get_info( +/*================*/ + dict_index_t* index) /*!< in: index */ + __attribute__((nonnull)); +/*****************************************************************//** +Creates and initializes a search info struct. +@return own: search info struct */ +UNIV_INTERN +btr_search_t* +btr_search_info_create( +/*===================*/ + mem_heap_t* heap); /*!< in: heap where created */ +/*****************************************************************//** +Returns the value of ref_count. The value is protected by +btr_search_latch. +@return ref_count value. */ +UNIV_INTERN +ulint +btr_search_info_get_ref_count( +/*==========================*/ + btr_search_t* info); /*!< in: search info. */ +/*********************************************************************//** +Updates the search info. */ +UNIV_INLINE +void +btr_search_info_update( +/*===================*/ + dict_index_t* index, /*!< in: index of the cursor */ + btr_cur_t* cursor);/*!< in: cursor which was just positioned */ +/******************************************************************//** +Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. +@return TRUE if succeeded */ +UNIV_INTERN +ibool +btr_search_guess_on_hash( +/*=====================*/ + dict_index_t* index, /*!< in: index */ + btr_search_t* info, /*!< in: index search info */ + const dtuple_t* tuple, /*!< in: logical record */ + ulint mode, /*!< in: PAGE_CUR_L, ... */ + ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /*!< out: tree cursor */ + ulint has_search_latch,/*!< in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, RW_X_LATCH, or 0 */ + mtr_t* mtr); /*!< in: mtr */ +/********************************************************************//** +Moves or deletes hash entries for moved records. If new_page is already hashed, +then the hash index for page, if any, is dropped. If new_page is not hashed, +and page is hashed, then a new hash index is built to new_page with the same +parameters as page (this often happens when a page is split). */ +UNIV_INTERN +void +btr_search_move_or_delete_hash_entries( +/*===================================*/ + buf_block_t* new_block, /*!< in: records are copied + to this page */ + buf_block_t* block, /*!< in: index page from which + records were copied, and the + copied records will be deleted + from this page */ + dict_index_t* index); /*!< in: record descriptor */ +/********************************************************************//** +Drops a page hash index. */ +UNIV_INTERN +void +btr_search_drop_page_hash_index( +/*============================*/ + buf_block_t* block); /*!< in: block containing index page, + s- or x-latched, or an index page + for which we know that + block->buf_fix_count == 0 */ +/********************************************************************//** +Drops a possible page hash index when a page is evicted from the buffer pool +or freed in a file segment. */ +UNIV_INTERN +void +btr_search_drop_page_hash_when_freed( +/*=================================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no); /*!< in: page number */ +/********************************************************************//** +Updates the page hash index when a single record is inserted on a page. */ +UNIV_INTERN +void +btr_search_update_hash_node_on_insert( +/*==================================*/ + btr_cur_t* cursor);/*!< in: cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor */ +/********************************************************************//** +Updates the page hash index when a single record is inserted on a page. */ +UNIV_INTERN +void +btr_search_update_hash_on_insert( +/*=============================*/ + btr_cur_t* cursor);/*!< in: cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor */ +/********************************************************************//** +Updates the page hash index when a single record is deleted from a page. */ +UNIV_INTERN +void +btr_search_update_hash_on_delete( +/*=============================*/ + btr_cur_t* cursor);/*!< in: cursor which was positioned on the + record to delete using btr_cur_search_..., + the record is not yet deleted */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/********************************************************************//** +Validates the search system. +@return TRUE if ok */ +UNIV_INTERN +ibool +btr_search_validate(void); +/*======================*/ +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ + +/** The search info struct in an index */ +struct btr_search_t{ + ulint ref_count; /*!< Number of blocks in this index tree + that have search index built + i.e. block->index points to this index. + Protected by btr_search_latch except + when during initialization in + btr_search_info_create(). */ + + /* @{ The following fields are not protected by any latch. + Unfortunately, this means that they must be aligned to + the machine word, i.e., they cannot be turned into bit-fields. */ + buf_block_t* root_guess;/*!< the root page frame when it was last time + fetched, or NULL */ + ulint hash_analysis; /*!< when this exceeds + BTR_SEARCH_HASH_ANALYSIS, the hash + analysis starts; this is reset if no + success noticed */ + ibool last_hash_succ; /*!< TRUE if the last search would have + succeeded, or did succeed, using the hash + index; NOTE that the value here is not exact: + it is not calculated for every search, and the + calculation itself is not always accurate! */ + ulint n_hash_potential; + /*!< number of consecutive searches + which would have succeeded, or did succeed, + using the hash index; + the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */ + /* @} */ + /*---------------------- @{ */ + ulint n_fields; /*!< recommended prefix length for hash search: + number of full fields */ + ulint n_bytes; /*!< recommended prefix: number of bytes in + an incomplete field + @see BTR_PAGE_MAX_REC_SIZE */ + ibool left_side; /*!< TRUE or FALSE, depending on whether + the leftmost record of several records with + the same prefix should be indexed in the + hash index */ + /*---------------------- @} */ +#ifdef UNIV_SEARCH_PERF_STAT + ulint n_hash_succ; /*!< number of successful hash searches thus + far */ + ulint n_hash_fail; /*!< number of failed hash searches */ + ulint n_patt_succ; /*!< number of successful pattern searches thus + far */ + ulint n_searches; /*!< number of searches */ +#endif /* UNIV_SEARCH_PERF_STAT */ +#ifdef UNIV_DEBUG + ulint magic_n; /*!< magic number @see BTR_SEARCH_MAGIC_N */ +/** value of btr_search_t::magic_n, used in assertions */ +# define BTR_SEARCH_MAGIC_N 1112765 +#endif /* UNIV_DEBUG */ +}; + +/** The hash index system */ +struct btr_search_sys_t{ + hash_table_t* hash_index; /*!< the adaptive hash index, + mapping dtuple_fold values + to rec_t pointers on index pages */ +}; + +/** The adaptive hash index */ +extern btr_search_sys_t* btr_search_sys; + +#ifdef UNIV_SEARCH_PERF_STAT +/** Number of successful adaptive hash index lookups */ +extern ulint btr_search_n_succ; +/** Number of failed adaptive hash index lookups */ +extern ulint btr_search_n_hash_fail; +#endif /* UNIV_SEARCH_PERF_STAT */ + +/** After change in n_fields or n_bytes in info, this many rounds are waited +before starting the hash analysis again: this is to save CPU time when there +is no hope in building a hash index. */ +#define BTR_SEARCH_HASH_ANALYSIS 17 + +/** Limit of consecutive searches for trying a search shortcut on the search +pattern */ +#define BTR_SEARCH_ON_PATTERN_LIMIT 3 + +/** Limit of consecutive searches for trying a search shortcut using +the hash index */ +#define BTR_SEARCH_ON_HASH_LIMIT 3 + +/** We do this many searches before trying to keep the search latch +over calls from MySQL. If we notice someone waiting for the latch, we +again set this much timeout. This is to reduce contention. */ +#define BTR_SEA_TIMEOUT 10000 + +#ifndef UNIV_NONINL +#include "btr0sea.ic" +#endif + +#endif diff --git a/storage/innobase/include/btr0sea.ic b/storage/innobase/include/btr0sea.ic new file mode 100644 index 00000000000..0bd869be136 --- /dev/null +++ b/storage/innobase/include/btr0sea.ic @@ -0,0 +1,82 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0sea.ic +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "dict0mem.h" +#include "btr0cur.h" +#include "buf0buf.h" + +/*********************************************************************//** +Updates the search info. */ +UNIV_INTERN +void +btr_search_info_update_slow( +/*========================*/ + btr_search_t* info, /*!< in/out: search info */ + btr_cur_t* cursor);/*!< in: cursor which was just positioned */ + +/********************************************************************//** +Returns search info for an index. +@return search info; search mutex reserved */ +UNIV_INLINE +btr_search_t* +btr_search_get_info( +/*================*/ + dict_index_t* index) /*!< in: index */ +{ + return(index->search_info); +} + +/*********************************************************************//** +Updates the search info. */ +UNIV_INLINE +void +btr_search_info_update( +/*===================*/ + dict_index_t* index, /*!< in: index of the cursor */ + btr_cur_t* cursor) /*!< in: cursor which was just positioned */ +{ + btr_search_t* info; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + info = btr_search_get_info(index); + + info->hash_analysis++; + + if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) { + + /* Do nothing */ + + return; + + } + + ut_ad(cursor->flag != BTR_CUR_HASH); + + btr_search_info_update_slow(info, cursor); +} diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h new file mode 100644 index 00000000000..c1a4531f861 --- /dev/null +++ b/storage/innobase/include/btr0types.h @@ -0,0 +1,203 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0types.h +The index tree general types + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#ifndef btr0types_h +#define btr0types_h + +#include "univ.i" + +#include "rem0types.h" +#include "page0types.h" +#include "sync0rw.h" + +/** Persistent cursor */ +struct btr_pcur_t; +/** B-tree cursor */ +struct btr_cur_t; +/** B-tree search information for the adaptive hash index */ +struct btr_search_t; + +#ifndef UNIV_HOTBACKUP + +/** @brief The latch protecting the adaptive search system + +This latch protects the +(1) hash index; +(2) columns of a record to which we have a pointer in the hash index; + +but does NOT protect: + +(3) next record offset field in a record; +(4) next or previous records on the same page. + +Bear in mind (3) and (4) when using the hash index. +*/ +extern rw_lock_t* btr_search_latch_temp; + +#endif /* UNIV_HOTBACKUP */ + +/** The latch protecting the adaptive search system */ +#define btr_search_latch (*btr_search_latch_temp) + +/** Flag: has the search system been enabled? +Protected by btr_search_latch. */ +extern char btr_search_enabled; + +#ifdef UNIV_BLOB_DEBUG +# include "buf0types.h" +/** An index->blobs entry for keeping track of off-page column references */ +struct btr_blob_dbg_t; + +/** Insert to index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_insert( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/** Remove from index->blobs a reference to an off-page column. +@param index the index tree +@param b the reference +@param ctx context (for logging) */ +UNIV_INTERN +void +btr_blob_dbg_rbt_delete( +/*====================*/ + dict_index_t* index, /*!< in/out: index tree */ + const btr_blob_dbg_t* b, /*!< in: the reference */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/**************************************************************//** +Add to index->blobs any references to off-page columns from a record. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add_rec( +/*=================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Remove from index->blobs any references to off-page columns from a record. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove_rec( +/*====================*/ + const rec_t* rec, /*!< in: record */ + dict_index_t* index, /*!< in/out: index */ + const ulint* offsets,/*!< in: offsets */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Count and add to index->blobs any references to off-page columns +from records on a page. +@return number of references added */ +UNIV_INTERN +ulint +btr_blob_dbg_add( +/*=============*/ + const page_t* page, /*!< in: rewritten page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Count and remove from index->blobs any references to off-page columns +from records on a page. +Used when reorganizing a page, before copying the records. +@return number of references removed */ +UNIV_INTERN +ulint +btr_blob_dbg_remove( +/*================*/ + const page_t* page, /*!< in: b-tree page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); +/**************************************************************//** +Restore in index->blobs any references to off-page columns +Used when page reorganize fails due to compressed page overflow. */ +UNIV_INTERN +void +btr_blob_dbg_restore( +/*=================*/ + const page_t* npage, /*!< in: page that failed to compress */ + const page_t* page, /*!< in: copy of original page */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx) /*!< in: context (for logging) */ + __attribute__((nonnull)); + +/** Operation that processes the BLOB references of an index record +@param[in] rec record on index page +@param[in/out] index the index tree of the record +@param[in] offsets rec_get_offsets(rec,index) +@param[in] ctx context (for logging) +@return number of BLOB references processed */ +typedef ulint (*btr_blob_dbg_op_f) +(const rec_t* rec,dict_index_t* index,const ulint* offsets,const char* ctx); + +/**************************************************************//** +Count and process all references to off-page columns on a page. +@return number of references processed */ +UNIV_INTERN +ulint +btr_blob_dbg_op( +/*============*/ + const page_t* page, /*!< in: B-tree leaf page */ + const rec_t* rec, /*!< in: record to start from + (NULL to process the whole page) */ + dict_index_t* index, /*!< in/out: index */ + const char* ctx, /*!< in: context (for logging) */ + const btr_blob_dbg_op_f op) /*!< in: operation on records */ + __attribute__((nonnull(1,3,4,5))); +#else /* UNIV_BLOB_DEBUG */ +# define btr_blob_dbg_add_rec(rec, index, offsets, ctx) ((void) 0) +# define btr_blob_dbg_add(page, index, ctx) ((void) 0) +# define btr_blob_dbg_remove_rec(rec, index, offsets, ctx) ((void) 0) +# define btr_blob_dbg_remove(page, index, ctx) ((void) 0) +# define btr_blob_dbg_restore(npage, page, index, ctx) ((void) 0) +# define btr_blob_dbg_op(page, rec, index, ctx, op) ((void) 0) +#endif /* UNIV_BLOB_DEBUG */ + +/** The size of a reference to data stored on a different page. +The reference is stored at the end of the prefix of the field +in the index record. */ +#define BTR_EXTERN_FIELD_REF_SIZE 20 + +/** A BLOB field reference full of zero, for use in assertions and tests. +Initially, BLOB field references are set to zero, in +dtuple_convert_big_rec(). */ +extern const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE]; + +#endif diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h new file mode 100644 index 00000000000..fab9a4b828b --- /dev/null +++ b/storage/innobase/include/buf0buddy.h @@ -0,0 +1,77 @@ +/***************************************************************************** + +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buddy.h +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#ifndef buf0buddy_h +#define buf0buddy_h + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "univ.i" +#include "buf0types.h" + +/**********************************************************************//** +Allocate a block. The thread calling this function must hold +buf_pool->mutex and must not hold buf_pool->zip_mutex or any +block->mutex. The buf_pool->mutex may be released and reacquired. +This function should only be used for allocating compressed page frames. +@return allocated block, never NULL */ +UNIV_INLINE +byte* +buf_buddy_alloc( +/*============*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool in which + the page resides */ + ulint size, /*!< in: compressed page size + (between UNIV_ZIP_SIZE_MIN and + UNIV_PAGE_SIZE) */ + ibool* lru) /*!< in: pointer to a variable + that will be assigned TRUE if + storage was allocated from the + LRU list and buf_pool->mutex was + temporarily released */ + __attribute__((malloc, nonnull)); + +/**********************************************************************//** +Deallocate a block. */ +UNIV_INLINE +void +buf_buddy_free( +/*===========*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool in which + the block resides */ + void* buf, /*!< in: block to be freed, must not + be pointed to by the buffer pool */ + ulint size) /*!< in: block size, + up to UNIV_PAGE_SIZE */ + __attribute__((nonnull)); + +#ifndef UNIV_NONINL +# include "buf0buddy.ic" +#endif + +#endif /* buf0buddy_h */ diff --git a/storage/innobase/include/buf0buddy.ic b/storage/innobase/include/buf0buddy.ic new file mode 100644 index 00000000000..be2f950162d --- /dev/null +++ b/storage/innobase/include/buf0buddy.ic @@ -0,0 +1,143 @@ +/***************************************************************************** + +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buddy.ic +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "buf0buf.h" +#include "buf0buddy.h" +#include "ut0ut.h" +#include "sync0sync.h" + +/**********************************************************************//** +Allocate a block. The thread calling this function must hold +buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex. +The buf_pool_mutex may be released and reacquired. +@return allocated block, never NULL */ +UNIV_INTERN +void* +buf_buddy_alloc_low( +/*================*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + ulint i, /*!< in: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ + ibool* lru) /*!< in: pointer to a variable that + will be assigned TRUE if storage was + allocated from the LRU list and + buf_pool->mutex was temporarily + released */ + __attribute__((malloc, nonnull)); + +/**********************************************************************//** +Deallocate a block. */ +UNIV_INTERN +void +buf_buddy_free_low( +/*===============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + void* buf, /*!< in: block to be freed, must not be + pointed to by the buffer pool */ + ulint i) /*!< in: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ + __attribute__((nonnull)); + +/**********************************************************************//** +Get the index of buf_pool->zip_free[] for a given block size. +@return index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ +UNIV_INLINE +ulint +buf_buddy_get_slot( +/*===============*/ + ulint size) /*!< in: block size */ +{ + ulint i; + ulint s; + + ut_ad(size >= UNIV_ZIP_SIZE_MIN); + + for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) { + } + + ut_ad(i <= BUF_BUDDY_SIZES); + return(i); +} + +/**********************************************************************//** +Allocate a block. The thread calling this function must hold +buf_pool->mutex and must not hold buf_pool->zip_mutex or any +block->mutex. The buf_pool->mutex may be released and reacquired. +This function should only be used for allocating compressed page frames. +@return allocated block, never NULL */ +UNIV_INLINE +byte* +buf_buddy_alloc( +/*============*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool in which + the page resides */ + ulint size, /*!< in: compressed page size + (between UNIV_ZIP_SIZE_MIN and + UNIV_PAGE_SIZE) */ + ibool* lru) /*!< in: pointer to a variable + that will be assigned TRUE if + storage was allocated from the + LRU list and buf_pool->mutex was + temporarily released */ +{ + ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(ut_is_2pow(size)); + ut_ad(size >= UNIV_ZIP_SIZE_MIN); + ut_ad(size <= UNIV_PAGE_SIZE); + + return((byte*) buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size), + lru)); +} + +/**********************************************************************//** +Deallocate a block. */ +UNIV_INLINE +void +buf_buddy_free( +/*===========*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool in which + the block resides */ + void* buf, /*!< in: block to be freed, must not + be pointed to by the buffer pool */ + ulint size) /*!< in: block size, + up to UNIV_PAGE_SIZE */ +{ + ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(ut_is_2pow(size)); + ut_ad(size >= UNIV_ZIP_SIZE_MIN); + ut_ad(size <= UNIV_PAGE_SIZE); + + buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size)); +} + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h new file mode 100644 index 00000000000..b669bd203e0 --- /dev/null +++ b/storage/innobase/include/buf0buf.h @@ -0,0 +1,2179 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buf.h +The database buffer pool high-level routines + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0buf_h +#define buf0buf_h + +#include "univ.i" +#include "fil0fil.h" +#include "mtr0types.h" +#include "buf0types.h" +#include "hash0hash.h" +#include "ut0byte.h" +#include "page0types.h" +#ifndef UNIV_HOTBACKUP +#include "ut0rbt.h" +#include "os0proc.h" +#include "log0log.h" + +/** @name Modes for buf_page_get_gen */ +/* @{ */ +#define BUF_GET 10 /*!< get always */ +#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */ +#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make + the block young in the LRU list */ +#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but + set no latch; we have + separated this case, because + it is error-prone programming + not to set a latch, and it + should be used with care */ +#define BUF_GET_IF_IN_POOL_OR_WATCH 15 + /*!< Get the page only if it's in the + buffer pool, if not then set a watch + on the page. */ +#define BUF_GET_POSSIBLY_FREED 16 + /*!< Like BUF_GET, but do not mind + if the file page has been freed. */ +/* @} */ +/** @name Modes for buf_page_get_known_nowait */ +/* @{ */ +#define BUF_MAKE_YOUNG 51 /*!< Move the block to the + start of the LRU list if there + is a danger that the block + would drift out of the buffer + pool*/ +#define BUF_KEEP_OLD 52 /*!< Preserve the current LRU + position of the block. */ +/* @} */ + +#define MAX_BUFFER_POOLS_BITS 6 /*!< Number of bits to representing + a buffer pool ID */ + +#define MAX_BUFFER_POOLS (1 << MAX_BUFFER_POOLS_BITS) + /*!< The maximum number of buffer + pools that can be defined */ + +#define BUF_POOL_WATCH_SIZE (srv_n_purge_threads + 1) + /*!< Maximum number of concurrent + buffer pool watches */ +#define MAX_PAGE_HASH_LOCKS 1024 /*!< The maximum number of + page_hash locks */ + +extern buf_pool_t* buf_pool_ptr; /*!< The buffer pools + of the database */ +#ifdef UNIV_DEBUG +extern ibool buf_debug_prints;/*!< If this is set TRUE, the program + prints info whenever read or flush + occurs */ +#endif /* UNIV_DEBUG */ +extern ulint srv_buf_pool_instances; +extern ulint srv_buf_pool_curr_size; +#else /* !UNIV_HOTBACKUP */ +extern buf_block_t* back_block1; /*!< first block, for --apply-log */ +extern buf_block_t* back_block2; /*!< second block, for page reorganize */ +#endif /* !UNIV_HOTBACKUP */ + +/** Magic value to use instead of checksums when they are disabled */ +#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL + +/** @brief States of a control block +@see buf_page_t + +The enumeration values must be 0..7. */ +enum buf_page_state { + BUF_BLOCK_POOL_WATCH, /*!< a sentinel for the buffer pool + watch, element of buf_pool->watch[] */ + BUF_BLOCK_ZIP_PAGE, /*!< contains a clean + compressed page */ + BUF_BLOCK_ZIP_DIRTY, /*!< contains a compressed + page that is in the + buf_pool->flush_list */ + + BUF_BLOCK_NOT_USED, /*!< is in the free list; + must be after the BUF_BLOCK_ZIP_ + constants for compressed-only pages + @see buf_block_state_valid() */ + BUF_BLOCK_READY_FOR_USE, /*!< when buf_LRU_get_free_block + returns a block, it is in this state */ + BUF_BLOCK_FILE_PAGE, /*!< contains a buffered file page */ + BUF_BLOCK_MEMORY, /*!< contains some main memory + object */ + BUF_BLOCK_REMOVE_HASH /*!< hash index should be removed + before putting to the free list */ +}; + + +/** This structure defines information we will fetch from each buffer pool. It +will be used to print table IO stats */ +struct buf_pool_info_t{ + /* General buffer pool info */ + ulint pool_unique_id; /*!< Buffer Pool ID */ + ulint pool_size; /*!< Buffer Pool size in pages */ + ulint lru_len; /*!< Length of buf_pool->LRU */ + ulint old_lru_len; /*!< buf_pool->LRU_old_len */ + ulint free_list_len; /*!< Length of buf_pool->free list */ + ulint flush_list_len; /*!< Length of buf_pool->flush_list */ + ulint n_pend_unzip; /*!< buf_pool->n_pend_unzip, pages + pending decompress */ + ulint n_pend_reads; /*!< buf_pool->n_pend_reads, pages + pending read */ + ulint n_pending_flush_lru; /*!< Pages pending flush in LRU */ + ulint n_pending_flush_single_page;/*!< Pages pending to be + flushed as part of single page + flushes issued by various user + threads */ + ulint n_pending_flush_list; /*!< Pages pending flush in FLUSH + LIST */ + ulint n_pages_made_young; /*!< number of pages made young */ + ulint n_pages_not_made_young; /*!< number of pages not made young */ + ulint n_pages_read; /*!< buf_pool->n_pages_read */ + ulint n_pages_created; /*!< buf_pool->n_pages_created */ + ulint n_pages_written; /*!< buf_pool->n_pages_written */ + ulint n_page_gets; /*!< buf_pool->n_page_gets */ + ulint n_ra_pages_read_rnd; /*!< buf_pool->n_ra_pages_read_rnd, + number of pages readahead */ + ulint n_ra_pages_read; /*!< buf_pool->n_ra_pages_read, number + of pages readahead */ + ulint n_ra_pages_evicted; /*!< buf_pool->n_ra_pages_evicted, + number of readahead pages evicted + without access */ + ulint n_page_get_delta; /*!< num of buffer pool page gets since + last printout */ + + /* Buffer pool access stats */ + double page_made_young_rate; /*!< page made young rate in pages + per second */ + double page_not_made_young_rate;/*!< page not made young rate + in pages per second */ + double pages_read_rate; /*!< num of pages read per second */ + double pages_created_rate; /*!< num of pages create per second */ + double pages_written_rate; /*!< num of pages written per second */ + ulint page_read_delta; /*!< num of pages read since last + printout */ + ulint young_making_delta; /*!< num of pages made young since + last printout */ + ulint not_young_making_delta; /*!< num of pages not make young since + last printout */ + + /* Statistics about read ahead algorithm. */ + double pages_readahead_rnd_rate;/*!< random readahead rate in pages per + second */ + double pages_readahead_rate; /*!< readahead rate in pages per + second */ + double pages_evicted_rate; /*!< rate of readahead page evicted + without access, in pages per second */ + + /* Stats about LRU eviction */ + ulint unzip_lru_len; /*!< length of buf_pool->unzip_LRU + list */ + /* Counters for LRU policy */ + ulint io_sum; /*!< buf_LRU_stat_sum.io */ + ulint io_cur; /*!< buf_LRU_stat_cur.io, num of IO + for current interval */ + ulint unzip_sum; /*!< buf_LRU_stat_sum.unzip */ + ulint unzip_cur; /*!< buf_LRU_stat_cur.unzip, num + pages decompressed in current + interval */ +}; + +/** The occupied bytes of lists in all buffer pools */ +struct buf_pools_list_size_t { + ulint LRU_bytes; /*!< LRU size in bytes */ + ulint unzip_LRU_bytes; /*!< unzip_LRU size in bytes */ + ulint flush_list_bytes; /*!< flush_list size in bytes */ +}; + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Acquire mutex on all buffer pool instances */ +UNIV_INLINE +void +buf_pool_mutex_enter_all(void); +/*===========================*/ + +/********************************************************************//** +Release mutex on all buffer pool instances */ +UNIV_INLINE +void +buf_pool_mutex_exit_all(void); +/*==========================*/ + +/********************************************************************//** +Creates the buffer pool. +@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */ +UNIV_INTERN +dberr_t +buf_pool_init( +/*=========*/ + ulint size, /*!< in: Size of the total pool in bytes */ + ulint n_instances); /*!< in: Number of instances */ +/********************************************************************//** +Frees the buffer pool at shutdown. This must not be invoked before +freeing all mutexes. */ +UNIV_INTERN +void +buf_pool_free( +/*==========*/ + ulint n_instances); /*!< in: numbere of instances to free */ + +/********************************************************************//** +Clears the adaptive hash index on all pages in the buffer pool. */ +UNIV_INTERN +void +buf_pool_clear_hash_index(void); +/*===========================*/ + +/********************************************************************//** +Relocate a buffer control block. Relocates the block on the LRU list +and in buf_pool->page_hash. Does not relocate bpage->list. +The caller must take care of relocating bpage->list. */ +UNIV_INTERN +void +buf_relocate( +/*=========*/ + buf_page_t* bpage, /*!< in/out: control block being relocated; + buf_page_get_state(bpage) must be + BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */ + buf_page_t* dpage) /*!< in/out: destination control block */ + __attribute__((nonnull)); +/*********************************************************************//** +Gets the current size of buffer buf_pool in bytes. +@return size in bytes */ +UNIV_INLINE +ulint +buf_pool_get_curr_size(void); +/*========================*/ +/*********************************************************************//** +Gets the current size of buffer buf_pool in frames. +@return size in pages */ +UNIV_INLINE +ulint +buf_pool_get_n_pages(void); +/*=======================*/ +/********************************************************************//** +Gets the smallest oldest_modification lsn for any page in the pool. Returns +zero if all modified pages have been flushed to disk. +@return oldest modification in pool, zero if none */ +UNIV_INTERN +lsn_t +buf_pool_get_oldest_modification(void); +/*==================================*/ + +/********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ + __attribute__((malloc)); +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ + __attribute__((nonnull)); + +/********************************************************************//** +Allocates a buffer block. +@return own: the allocated block, in state BUF_BLOCK_MEMORY */ +UNIV_INTERN +buf_block_t* +buf_block_alloc( +/*============*/ + buf_pool_t* buf_pool); /*!< in: buffer pool instance, + or NULL for round-robin selection + of the buffer pool */ +/********************************************************************//** +Frees a buffer block which does not contain a file page. */ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block); /*!< in, own: block to be freed */ +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Copies contents of a buffer frame to a given buffer. +@return buf */ +UNIV_INLINE +byte* +buf_frame_copy( +/*===========*/ + byte* buf, /*!< in: buffer to copy to */ + const buf_frame_t* frame); /*!< in: buffer frame */ +#ifndef UNIV_HOTBACKUP +/**************************************************************//** +NOTE! The following macros should be used instead of buf_page_get_gen, +to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed +in LA! */ +#define buf_page_get(SP, ZS, OF, LA, MTR) buf_page_get_gen(\ + SP, ZS, OF, LA, NULL,\ + BUF_GET, __FILE__, __LINE__, MTR) +/**************************************************************//** +Use these macros to bufferfix a page with no latching. Remember not to +read the contents of the page unless you know it is safe. Do not modify +the contents of the page! We have separated this case, because it is +error-prone programming not to set a latch, and it should be used +with care. */ +#define buf_page_get_with_no_latch(SP, ZS, OF, MTR) buf_page_get_gen(\ + SP, ZS, OF, RW_NO_LATCH, NULL,\ + BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR) +/********************************************************************//** +This is the general function used to get optimistic access to a database +page. +@return TRUE if success */ +UNIV_INTERN +ibool +buf_page_optimistic_get( +/*====================*/ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /*!< in: guessed block */ + ib_uint64_t modify_clock,/*!< in: modify clock value */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mini-transaction */ +/********************************************************************//** +This is used to get access to a known database page, when no waiting can be +done. +@return TRUE if success */ +UNIV_INTERN +ibool +buf_page_get_known_nowait( +/*======================*/ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /*!< in: the known page */ + ulint mode, /*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mini-transaction */ + +/*******************************************************************//** +Given a tablespace id and page number tries to get that page. If the +page is not in the buffer pool it is not loaded and NULL is returned. +Suitable for using when holding the lock_sys_t::mutex. */ +UNIV_INTERN +const buf_block_t* +buf_page_try_get_func( +/*==================*/ + ulint space_id,/*!< in: tablespace id */ + ulint page_no,/*!< in: page number */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mini-transaction */ + +/** Tries to get a page. If the page is not in the buffer pool it is +not loaded. Suitable for using when holding the lock_sys_t::mutex. +@param space_id in: tablespace id +@param page_no in: page number +@param mtr in: mini-transaction +@return the page if in buffer pool, NULL if not */ +#define buf_page_try_get(space_id, page_no, mtr) \ + buf_page_try_get_func(space_id, page_no, __FILE__, __LINE__, mtr); + +/********************************************************************//** +Get read access to a compressed page (usually of type +FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2). +The page must be released with buf_page_release_zip(). +NOTE: the page is not protected by any latch. Mutual exclusion has to +be implemented at a higher level. In other words, all possible +accesses to a given page through this function must be protected by +the same set of mutexes or latches. +@return pointer to the block, or NULL if not compressed */ +UNIV_INTERN +buf_page_t* +buf_page_get_zip( +/*=============*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size */ + ulint offset);/*!< in: page number */ +/********************************************************************//** +This is the general function used to get access to a database page. +@return pointer to the block or NULL */ +UNIV_INTERN +buf_block_t* +buf_page_get_gen( +/*=============*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint offset, /*!< in: page number */ + ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ + buf_block_t* guess, /*!< in: guessed block or NULL */ + ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL, + BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH or + BUF_GET_IF_IN_POOL_OR_WATCH */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr); /*!< in: mini-transaction */ +/********************************************************************//** +Initializes a page to the buffer buf_pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => +FILE_PAGE (the other is buf_page_get_gen). +@return pointer to the block, page bufferfixed */ +UNIV_INTERN +buf_block_t* +buf_page_create( +/*============*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page within space in units of + a page */ + ulint zip_size,/*!< in: compressed page size, or 0 */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +#else /* !UNIV_HOTBACKUP */ +/********************************************************************//** +Inits a page to the buffer buf_pool, for use in mysqlbackup --restore. */ +UNIV_INTERN +void +buf_page_init_for_backup_restore( +/*=============================*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page within space + in units of a page */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + buf_block_t* block); /*!< in: block to init */ +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Releases a compressed-only page acquired with buf_page_get_zip(). */ +UNIV_INLINE +void +buf_page_release_zip( +/*=================*/ + buf_page_t* bpage); /*!< in: buffer block */ +/********************************************************************//** +Decrements the bufferfix count of a buffer control block and releases +a latch, if specified. */ +UNIV_INLINE +void +buf_page_release( +/*=============*/ + buf_block_t* block, /*!< in: buffer block */ + ulint rw_latch); /*!< in: RW_S_LATCH, RW_X_LATCH, + RW_NO_LATCH */ +/********************************************************************//** +Moves a page to the start of the buffer pool LRU list. This high-level +function can be used to prevent an important page from slipping out of +the buffer pool. */ +UNIV_INTERN +void +buf_page_make_young( +/*================*/ + buf_page_t* bpage); /*!< in: buffer block of a file page */ +/********************************************************************//** +Returns TRUE if the page can be found in the buffer pool hash table. + +NOTE that it is possible that the page is not yet read from disk, +though. + +@return TRUE if found in the page hash table */ +UNIV_INLINE +ibool +buf_page_peek( +/*==========*/ + ulint space, /*!< in: space id */ + ulint offset);/*!< in: page number */ +#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG +/********************************************************************//** +Sets file_page_was_freed TRUE if the page is found in the buffer pool. +This function should be called when we free a file page and want the +debug version to check that it is not accessed any more unless +reallocated. +@return control block if found in page hash table, otherwise NULL */ +UNIV_INTERN +buf_page_t* +buf_page_set_file_page_was_freed( +/*=============================*/ + ulint space, /*!< in: space id */ + ulint offset);/*!< in: page number */ +/********************************************************************//** +Sets file_page_was_freed FALSE if the page is found in the buffer pool. +This function should be called when we free a file page and want the +debug version to check that it is not accessed any more unless +reallocated. +@return control block if found in page hash table, otherwise NULL */ +UNIV_INTERN +buf_page_t* +buf_page_reset_file_page_was_freed( +/*===============================*/ + ulint space, /*!< in: space id */ + ulint offset); /*!< in: page number */ +#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +ulint +buf_page_get_freed_page_clock( +/*==========================*/ + const buf_page_t* bpage) /*!< in: block */ + __attribute__((pure)); +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +ulint +buf_block_get_freed_page_clock( +/*===========================*/ + const buf_block_t* block) /*!< in: block */ + __attribute__((pure)); + +/********************************************************************//** +Tells if a block is still close enough to the MRU end of the LRU list +meaning that it is not in danger of getting evicted and also implying +that it has been accessed recently. +Note that this is for heuristics only and does not reserve buffer pool +mutex. +@return TRUE if block is close to MRU end of LRU */ +UNIV_INLINE +ibool +buf_page_peek_if_young( +/*===================*/ + const buf_page_t* bpage); /*!< in: block */ +/********************************************************************//** +Recommends a move of a block to the start of the LRU list if there is danger +of dropping from the buffer pool. NOTE: does not reserve the buffer pool +mutex. +@return TRUE if should be made younger */ +UNIV_INLINE +ibool +buf_page_peek_if_too_old( +/*=====================*/ + const buf_page_t* bpage); /*!< in: block to make younger */ +/********************************************************************//** +Gets the youngest modification log sequence number for a frame. +Returns zero if not file page or no modification occurred yet. +@return newest modification to page */ +UNIV_INLINE +lsn_t +buf_page_get_newest_modification( +/*=============================*/ + const buf_page_t* bpage); /*!< in: block containing the + page frame */ +/********************************************************************//** +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool->mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +void +buf_block_modify_clock_inc( +/*=======================*/ + buf_block_t* block); /*!< in: block */ +/********************************************************************//** +Returns the value of the modify clock. The caller must have an s-lock +or x-lock on the block. +@return value */ +UNIV_INLINE +ib_uint64_t +buf_block_get_modify_clock( +/*=======================*/ + buf_block_t* block); /*!< in: block */ +/*******************************************************************//** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_inc_func( +/*=======================*/ +# ifdef UNIV_SYNC_DEBUG + const char* file, /*!< in: file name */ + ulint line, /*!< in: line */ +# endif /* UNIV_SYNC_DEBUG */ + buf_block_t* block) /*!< in/out: block to bufferfix */ + __attribute__((nonnull)); + +/*******************************************************************//** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_fix( +/*===========*/ + buf_block_t* block); /*!< in/out: block to bufferfix */ + +/*******************************************************************//** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_unfix( +/*===========*/ + buf_block_t* block); /*!< in/out: block to bufferfix */ + +# ifdef UNIV_SYNC_DEBUG +/** Increments the bufferfix count. +@param b in/out: block to bufferfix +@param f in: file name where requested +@param l in: line number where requested */ +# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b) +# else /* UNIV_SYNC_DEBUG */ +/** Increments the bufferfix count. +@param b in/out: block to bufferfix +@param f in: file name where requested +@param l in: line number where requested */ +# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b) +# endif /* UNIV_SYNC_DEBUG */ +#else /* !UNIV_HOTBACKUP */ +# define buf_block_modify_clock_inc(block) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ +/********************************************************************//** +Checks if a page is corrupt. +@return TRUE if corrupted */ +UNIV_INTERN +ibool +buf_page_is_corrupted( +/*==================*/ + bool check_lsn, /*!< in: true if we need to check the + and complain about the LSN */ + const byte* read_buf, /*!< in: a database page */ + ulint zip_size) /*!< in: size of compressed page; + 0 for uncompressed pages */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Checks if a page is all zeroes. +@return TRUE if the page is all zeroes */ +bool +buf_page_is_zeroes( +/*===============*/ + const byte* read_buf, /*!< in: a database page */ + const ulint zip_size); /*!< in: size of compressed page; + 0 for uncompressed pages */ +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Gets the space id, page offset, and byte offset within page of a +pointer pointing to a buffer frame containing a file page. */ +UNIV_INLINE +void +buf_ptr_get_fsp_addr( +/*=================*/ + const void* ptr, /*!< in: pointer to a buffer frame */ + ulint* space, /*!< out: space id */ + fil_addr_t* addr); /*!< out: page offset and byte offset */ +/**********************************************************************//** +Gets the hash value of a block. This can be used in searches in the +lock hash table. +@return lock hash value */ +UNIV_INLINE +ulint +buf_block_get_lock_hash_val( +/*========================*/ + const buf_block_t* block) /*!< in: block */ + __attribute__((pure)); +#ifdef UNIV_DEBUG +/*********************************************************************//** +Finds a block in the buffer pool that points to a +given compressed page. +@return buffer block pointing to the compressed page, or NULL */ +UNIV_INTERN +buf_block_t* +buf_pool_contains_zip( +/*==================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + const void* data); /*!< in: pointer to compressed page */ +#endif /* UNIV_DEBUG */ + +/*********************************************************************** +FIXME_FTS: Gets the frame the pointer is pointing to. */ +UNIV_INLINE +buf_frame_t* +buf_frame_align( +/*============*/ + /* out: pointer to frame */ + byte* ptr); /* in: pointer to a frame */ + + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/*********************************************************************//** +Validates the buffer pool data structure. +@return TRUE */ +UNIV_INTERN +ibool +buf_validate(void); +/*==============*/ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/*********************************************************************//** +Prints info of the buffer pool data structure. */ +UNIV_INTERN +void +buf_print(void); +/*============*/ +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ +enum buf_page_print_flags { + /** Do not crash at the end of buf_page_print(). */ + BUF_PAGE_PRINT_NO_CRASH = 1, + /** Do not print the full page dump. */ + BUF_PAGE_PRINT_NO_FULL = 2 +}; + +/********************************************************************//** +Prints a page to stderr. */ +UNIV_INTERN +void +buf_page_print( +/*===========*/ + const byte* read_buf, /*!< in: a database page */ + ulint zip_size, /*!< in: compressed page size, or + 0 for uncompressed pages */ + ulint flags) /*!< in: 0 or + BUF_PAGE_PRINT_NO_CRASH or + BUF_PAGE_PRINT_NO_FULL */ + UNIV_COLD __attribute__((nonnull)); +/********************************************************************//** +Decompress a block. +@return TRUE if successful */ +UNIV_INTERN +ibool +buf_zip_decompress( +/*===============*/ + buf_block_t* block, /*!< in/out: block */ + ibool check); /*!< in: TRUE=verify the page checksum */ +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_DEBUG +/*********************************************************************//** +Returns the number of latched pages in the buffer pool. +@return number of latched pages */ +UNIV_INTERN +ulint +buf_get_latched_pages_number(void); +/*==============================*/ +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Returns the number of pending buf pool read ios. +@return number of pending read I/O operations */ +UNIV_INTERN +ulint +buf_get_n_pending_read_ios(void); +/*============================*/ +/*********************************************************************//** +Prints info of the buffer i/o. */ +UNIV_INTERN +void +buf_print_io( +/*=========*/ + FILE* file); /*!< in: file where to print */ +/*******************************************************************//** +Collect buffer pool stats information for a buffer pool. Also +record aggregated stats if there are more than one buffer pool +in the server */ +UNIV_INTERN +void +buf_stats_get_pool_info( +/*====================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool */ + ulint pool_id, /*!< in: buffer pool ID */ + buf_pool_info_t* all_pool_info); /*!< in/out: buffer pool info + to fill */ +/*********************************************************************//** +Returns the ratio in percents of modified pages in the buffer pool / +database pages in the buffer pool. +@return modified page percentage ratio */ +UNIV_INTERN +ulint +buf_get_modified_ratio_pct(void); +/*============================*/ +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +buf_refresh_io_stats( +/*=================*/ + buf_pool_t* buf_pool); /*!< buffer pool instance */ +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +buf_refresh_io_stats_all(void); +/*=================*/ +/*********************************************************************//** +Asserts that all file pages in the buffer are in a replaceable state. +@return TRUE */ +UNIV_INTERN +ibool +buf_all_freed(void); +/*===============*/ +/*********************************************************************//** +Checks that there currently are no pending i/o-operations for the buffer +pool. +@return number of pending i/o operations */ +UNIV_INTERN +ulint +buf_pool_check_no_pending_io(void); +/*==============================*/ +/*********************************************************************//** +Invalidates the file pages in the buffer pool when an archive recovery is +completed. All the file pages buffered must be in a replaceable state when +this function is called: not latched and not modified. */ +UNIV_INTERN +void +buf_pool_invalidate(void); +/*=====================*/ +#endif /* !UNIV_HOTBACKUP */ + +/*======================================================================== +--------------------------- LOWER LEVEL ROUTINES ------------------------- +=========================================================================*/ + +#ifdef UNIV_SYNC_DEBUG +/*********************************************************************//** +Adds latch level info for the rw-lock protecting the buffer frame. This +should be called in the debug version after a successful latching of a +page if we know the latching order level of the acquired latch. */ +UNIV_INLINE +void +buf_block_dbg_add_level( +/*====================*/ + buf_block_t* block, /*!< in: buffer page + where we have acquired latch */ + ulint level); /*!< in: latching order level */ +#else /* UNIV_SYNC_DEBUG */ +# define buf_block_dbg_add_level(block, level) /* nothing */ +#endif /* UNIV_SYNC_DEBUG */ +/*********************************************************************//** +Gets the state of a block. +@return state */ +UNIV_INLINE +enum buf_page_state +buf_page_get_state( +/*===============*/ + const buf_page_t* bpage); /*!< in: pointer to the control block */ +/*********************************************************************//** +Gets the state of a block. +@return state */ +UNIV_INLINE +enum buf_page_state +buf_block_get_state( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Sets the state of a block. */ +UNIV_INLINE +void +buf_page_set_state( +/*===============*/ + buf_page_t* bpage, /*!< in/out: pointer to control block */ + enum buf_page_state state); /*!< in: state */ +/*********************************************************************//** +Sets the state of a block. */ +UNIV_INLINE +void +buf_block_set_state( +/*================*/ + buf_block_t* block, /*!< in/out: pointer to control block */ + enum buf_page_state state); /*!< in: state */ +/*********************************************************************//** +Determines if a block is mapped to a tablespace. +@return TRUE if mapped */ +UNIV_INLINE +ibool +buf_page_in_file( +/*=============*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ + __attribute__((pure)); +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Determines if a block should be on unzip_LRU list. +@return TRUE if block belongs to unzip_LRU */ +UNIV_INLINE +ibool +buf_page_belongs_to_unzip_LRU( +/*==========================*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ + __attribute__((pure)); + +/*********************************************************************//** +Gets the mutex of a block. +@return pointer to mutex protecting bpage */ +UNIV_INLINE +ib_mutex_t* +buf_page_get_mutex( +/*===============*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ + __attribute__((pure)); + +/*********************************************************************//** +Get the flush type of a page. +@return flush type */ +UNIV_INLINE +buf_flush_t +buf_page_get_flush_type( +/*====================*/ + const buf_page_t* bpage) /*!< in: buffer page */ + __attribute__((pure)); +/*********************************************************************//** +Set the flush type of a page. */ +UNIV_INLINE +void +buf_page_set_flush_type( +/*====================*/ + buf_page_t* bpage, /*!< in: buffer page */ + buf_flush_t flush_type); /*!< in: flush type */ +/*********************************************************************//** +Map a block to a file page. */ +UNIV_INLINE +void +buf_block_set_file_page( +/*====================*/ + buf_block_t* block, /*!< in/out: pointer to control block */ + ulint space, /*!< in: tablespace id */ + ulint page_no);/*!< in: page number */ +/*********************************************************************//** +Gets the io_fix state of a block. +@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_page_get_io_fix( +/*================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the io_fix state of a block. +@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_block_get_io_fix( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Sets the io_fix state of a block. */ +UNIV_INLINE +void +buf_page_set_io_fix( +/*================*/ + buf_page_t* bpage, /*!< in/out: control block */ + enum buf_io_fix io_fix);/*!< in: io_fix state */ +/*********************************************************************//** +Sets the io_fix state of a block. */ +UNIV_INLINE +void +buf_block_set_io_fix( +/*=================*/ + buf_block_t* block, /*!< in/out: control block */ + enum buf_io_fix io_fix);/*!< in: io_fix state */ +/*********************************************************************//** +Makes a block sticky. A sticky block implies that even after we release +the buf_pool->mutex and the block->mutex: +* it cannot be removed from the flush_list +* the block descriptor cannot be relocated +* it cannot be removed from the LRU list +Note that: +* the block can still change its position in the LRU list +* the next and previous pointers can change. */ +UNIV_INLINE +void +buf_page_set_sticky( +/*================*/ + buf_page_t* bpage); /*!< in/out: control block */ +/*********************************************************************//** +Removes stickiness of a block. */ +UNIV_INLINE +void +buf_page_unset_sticky( +/*==================*/ + buf_page_t* bpage); /*!< in/out: control block */ +/********************************************************************//** +Determine if a buffer block can be relocated in memory. The block +can be dirty, but it must not be I/O-fixed or bufferfixed. */ +UNIV_INLINE +ibool +buf_page_can_relocate( +/*==================*/ + const buf_page_t* bpage) /*!< control block being relocated */ + __attribute__((pure)); + +/*********************************************************************//** +Determine if a block has been flagged old. +@return TRUE if old */ +UNIV_INLINE +ibool +buf_page_is_old( +/*============*/ + const buf_page_t* bpage) /*!< in: control block */ + __attribute__((pure)); +/*********************************************************************//** +Flag a block old. */ +UNIV_INLINE +void +buf_page_set_old( +/*=============*/ + buf_page_t* bpage, /*!< in/out: control block */ + ibool old); /*!< in: old */ +/*********************************************************************//** +Determine the time of first access of a block in the buffer pool. +@return ut_time_ms() at the time of first access, 0 if not accessed */ +UNIV_INLINE +unsigned +buf_page_is_accessed( +/*=================*/ + const buf_page_t* bpage) /*!< in: control block */ + __attribute__((nonnull, pure)); +/*********************************************************************//** +Flag a block accessed. */ +UNIV_INLINE +void +buf_page_set_accessed( +/*==================*/ + buf_page_t* bpage) /*!< in/out: control block */ + __attribute__((nonnull)); +/*********************************************************************//** +Gets the buf_block_t handle of a buffered file block if an uncompressed +page frame exists, or NULL. Note: even though bpage is not declared a +const we don't update its value. It is safe to make this pure. +@return control block, or NULL */ +UNIV_INLINE +buf_block_t* +buf_page_get_block( +/*===============*/ + buf_page_t* bpage) /*!< in: control block, or NULL */ + __attribute__((pure)); +#endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets a pointer to the memory frame of a block. +@return pointer to the frame */ +UNIV_INLINE +buf_frame_t* +buf_block_get_frame( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +#else /* UNIV_DEBUG */ +# define buf_block_get_frame(block) (block)->frame +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Gets the space id of a block. +@return space id */ +UNIV_INLINE +ulint +buf_page_get_space( +/*===============*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the space id of a block. +@return space id */ +UNIV_INLINE +ulint +buf_block_get_space( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the page number of a block. +@return page number */ +UNIV_INLINE +ulint +buf_page_get_page_no( +/*=================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the page number of a block. +@return page number */ +UNIV_INLINE +ulint +buf_block_get_page_no( +/*==================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the compressed page size of a block. +@return compressed page size, or 0 */ +UNIV_INLINE +ulint +buf_page_get_zip_size( +/*==================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the compressed page size of a block. +@return compressed page size, or 0 */ +UNIV_INLINE +ulint +buf_block_get_zip_size( +/*===================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ + __attribute__((pure)); +/*********************************************************************//** +Gets the compressed page descriptor corresponding to an uncompressed page +if applicable. */ +#define buf_block_get_page_zip(block) \ + ((block)->page.zip.data ? &(block)->page.zip : NULL) +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Gets the block to whose frame the pointer is pointing to. +@return pointer to block, never NULL */ +UNIV_INTERN +buf_block_t* +buf_block_align( +/*============*/ + const byte* ptr); /*!< in: pointer to a frame */ +/********************************************************************//** +Find out if a pointer belongs to a buf_block_t. It can be a pointer to +the buf_block_t itself or a member of it +@return TRUE if ptr belongs to a buf_block_t struct */ +UNIV_INTERN +ibool +buf_pointer_is_block_field( +/*=======================*/ + const void* ptr); /*!< in: pointer not + dereferenced */ +/** Find out if a pointer corresponds to a buf_block_t::mutex. +@param m in: mutex candidate +@return TRUE if m is a buf_block_t::mutex */ +#define buf_pool_is_block_mutex(m) \ + buf_pointer_is_block_field((const void*)(m)) +/** Find out if a pointer corresponds to a buf_block_t::lock. +@param l in: rw-lock candidate +@return TRUE if l is a buf_block_t::lock */ +#define buf_pool_is_block_lock(l) \ + buf_pointer_is_block_field((const void*)(l)) + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +/*********************************************************************//** +Gets the compressed page descriptor corresponding to an uncompressed page +if applicable. +@return compressed page descriptor, or NULL */ +UNIV_INLINE +const page_zip_des_t* +buf_frame_get_page_zip( +/*===================*/ + const byte* ptr); /*!< in: pointer to the page */ +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +/********************************************************************//** +Function which inits a page for read to the buffer buf_pool. If the page is +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. +Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. +@return pointer to the block or NULL */ +UNIV_INTERN +buf_page_t* +buf_page_init_for_read( +/*===================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */ + ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size, or 0 */ + ibool unzip, /*!< in: TRUE=request uncompressed page */ + ib_int64_t tablespace_version,/*!< in: prevents reading from a wrong + version of the tablespace in case we have done + DISCARD + IMPORT */ + ulint offset);/*!< in: page number */ +/********************************************************************//** +Completes an asynchronous read or write request of a file page to or from +the buffer pool. +@return true if successful */ +UNIV_INTERN +bool +buf_page_io_complete( +/*=================*/ + buf_page_t* bpage); /*!< in: pointer to the block in question */ +/********************************************************************//** +Calculates a folded value of a file page address to use in the page hash +table. +@return the folded value */ +UNIV_INLINE +ulint +buf_page_address_fold( +/*==================*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: offset of the page within space */ + __attribute__((const)); +/********************************************************************//** +Calculates the index of a buffer pool to the buf_pool[] array. +@return the position of the buffer pool in buf_pool[] */ +UNIV_INLINE +ulint +buf_pool_index( +/*===========*/ + const buf_pool_t* buf_pool) /*!< in: buffer pool */ + __attribute__((nonnull, const)); +/******************************************************************//** +Returns the buffer pool instance given a page instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_bpage( +/*================*/ + const buf_page_t* bpage); /*!< in: buffer pool page */ +/******************************************************************//** +Returns the buffer pool instance given a block instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_block( +/*================*/ + const buf_block_t* block); /*!< in: block */ +/******************************************************************//** +Returns the buffer pool instance given space and offset of page +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_get( +/*==========*/ + ulint space, /*!< in: space id */ + ulint offset);/*!< in: offset of the page within space */ +/******************************************************************//** +Returns the buffer pool instance given its array index +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_array( +/*================*/ + ulint index); /*!< in: array index to get + buffer pool instance from */ +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +@return block, NULL if not found */ +UNIV_INLINE +buf_page_t* +buf_page_hash_get_low( +/*==================*/ + buf_pool_t* buf_pool,/*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page within space */ + ulint fold); /*!< in: buf_page_address_fold(space, offset) */ +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +If the block is found and lock is not NULL then the appropriate +page_hash lock is acquired in the specified lock mode. Otherwise, +mode value is ignored. It is up to the caller to release the +lock. If the block is found and the lock is NULL then the page_hash +lock is released by this function. +@return block, NULL if not found, or watch sentinel (if watch is true) */ +UNIV_INLINE +buf_page_t* +buf_page_hash_get_locked( +/*=====================*/ + /*!< out: pointer to the bpage, + or NULL; if NULL, hash_lock + is also NULL. */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + rw_lock_t** lock, /*!< in/out: lock of the page + hash acquired if bpage is + found. NULL otherwise. If NULL + is passed then the hash_lock + is released by this function */ + ulint lock_mode, /*!< in: RW_LOCK_EX or + RW_LOCK_SHARED. Ignored if + lock == NULL */ + bool watch = false); /*!< in: if true, return watch + sentinel also. */ +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +If the block is found and lock is not NULL then the appropriate +page_hash lock is acquired in the specified lock mode. Otherwise, +mode value is ignored. It is up to the caller to release the +lock. If the block is found and the lock is NULL then the page_hash +lock is released by this function. +@return block, NULL if not found */ +UNIV_INLINE +buf_block_t* +buf_block_hash_get_locked( +/*=====================*/ + /*!< out: pointer to the bpage, + or NULL; if NULL, hash_lock + is also NULL. */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + rw_lock_t** lock, /*!< in/out: lock of the page + hash acquired if bpage is + found. NULL otherwise. If NULL + is passed then the hash_lock + is released by this function */ + ulint lock_mode); /*!< in: RW_LOCK_EX or + RW_LOCK_SHARED. Ignored if + lock == NULL */ +/* There are four different ways we can try to get a bpage or block +from the page hash: +1) Caller already holds the appropriate page hash lock: in the case call +buf_page_hash_get_low() function. +2) Caller wants to hold page hash lock in x-mode +3) Caller wants to hold page hash lock in s-mode +4) Caller doesn't want to hold page hash lock */ +#define buf_page_hash_get_s_locked(b, s, o, l) \ + buf_page_hash_get_locked(b, s, o, l, RW_LOCK_SHARED) +#define buf_page_hash_get_x_locked(b, s, o, l) \ + buf_page_hash_get_locked(b, s, o, l, RW_LOCK_EX) +#define buf_page_hash_get(b, s, o) \ + buf_page_hash_get_locked(b, s, o, NULL, 0) +#define buf_page_get_also_watch(b, s, o) \ + buf_page_hash_get_locked(b, s, o, NULL, 0, true) + +#define buf_block_hash_get_s_locked(b, s, o, l) \ + buf_block_hash_get_locked(b, s, o, l, RW_LOCK_SHARED) +#define buf_block_hash_get_x_locked(b, s, o, l) \ + buf_block_hash_get_locked(b, s, o, l, RW_LOCK_EX) +#define buf_block_hash_get(b, s, o) \ + buf_block_hash_get_locked(b, s, o, NULL, 0) + +/*********************************************************************//** +Gets the current length of the free list of buffer blocks. +@return length of the free list */ +UNIV_INTERN +ulint +buf_get_free_list_len(void); +/*=======================*/ + +/********************************************************************//** +Determine if a block is a sentinel for a buffer pool watch. +@return TRUE if a sentinel for a buffer pool watch, FALSE if not */ +UNIV_INTERN +ibool +buf_pool_watch_is_sentinel( +/*=======================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + const buf_page_t* bpage) /*!< in: block */ + __attribute__((nonnull, warn_unused_result)); +/****************************************************************//** +Add watch for the given page to be read in. Caller must have the buffer pool +@return NULL if watch set, block if the page is in the buffer pool */ +UNIV_INTERN +buf_page_t* +buf_pool_watch_set( +/*===============*/ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + ulint fold) /*!< in: buf_page_address_fold(space, offset) */ + __attribute__((warn_unused_result)); +/****************************************************************//** +Stop watching if the page has been read in. +buf_pool_watch_set(space,offset) must have returned NULL before. */ +UNIV_INTERN +void +buf_pool_watch_unset( +/*=================*/ + ulint space, /*!< in: space id */ + ulint offset);/*!< in: page number */ +/****************************************************************//** +Check if the page has been read in. +This may only be called after buf_pool_watch_set(space,offset) +has returned NULL and before invoking buf_pool_watch_unset(space,offset). +@return FALSE if the given page was not read in, TRUE if it was */ +UNIV_INTERN +ibool +buf_pool_watch_occurred( +/*====================*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: page number */ + __attribute__((warn_unused_result)); +/********************************************************************//** +Get total buffer pool statistics. */ +UNIV_INTERN +void +buf_get_total_list_len( +/*===================*/ + ulint* LRU_len, /*!< out: length of all LRU lists */ + ulint* free_len, /*!< out: length of all free lists */ + ulint* flush_list_len);/*!< out: length of all flush lists */ +/********************************************************************//** +Get total list size in bytes from all buffer pools. */ +UNIV_INTERN +void +buf_get_total_list_size_in_bytes( +/*=============================*/ + buf_pools_list_size_t* buf_pools_list_size); /*!< out: list sizes + in all buffer pools */ +/********************************************************************//** +Get total buffer pool statistics. */ +UNIV_INTERN +void +buf_get_total_stat( +/*===============*/ + buf_pool_stat_t*tot_stat); /*!< out: buffer pool stats */ +/*********************************************************************//** +Get the nth chunk's buffer block in the specified buffer pool. +@return the nth chunk's buffer block. */ +UNIV_INLINE +buf_block_t* +buf_get_nth_chunk_block( +/*====================*/ + const buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint n, /*!< in: nth chunk in the buffer pool */ + ulint* chunk_size); /*!< in: chunk size */ + +/********************************************************************//** +Calculate the checksum of a page from compressed table and update the page. */ +UNIV_INTERN +void +buf_flush_update_zip_checksum( +/*==========================*/ + buf_frame_t* page, /*!< in/out: Page to update */ + ulint zip_size, /*!< in: Compressed page size */ + lsn_t lsn); /*!< in: Lsn to stamp on the page */ + +#endif /* !UNIV_HOTBACKUP */ + +/** The common buffer control block structure +for compressed and uncompressed frames */ + +/** Number of bits used for buffer page states. */ +#define BUF_PAGE_STATE_BITS 3 + +struct buf_page_t{ + /** @name General fields + None of these bit-fields must be modified without holding + buf_page_get_mutex() [buf_block_t::mutex or + buf_pool->zip_mutex], since they can be stored in the same + machine word. Some of these fields are additionally protected + by buf_pool->mutex. */ + /* @{ */ + + ib_uint32_t space; /*!< tablespace id; also protected + by buf_pool->mutex. */ + ib_uint32_t offset; /*!< page number; also protected + by buf_pool->mutex. */ + /** count of how manyfold this block is currently bufferfixed */ +#ifdef PAGE_ATOMIC_REF_COUNT + ib_uint32_t buf_fix_count; + + /** type of pending I/O operation; also protected by + buf_pool->mutex for writes only @see enum buf_io_fix */ + byte io_fix; + + byte state; +#else + unsigned buf_fix_count:19; + + /** type of pending I/O operation; also protected by + buf_pool->mutex for writes only @see enum buf_io_fix */ + unsigned io_fix:2; + + /*!< state of the control block; also protected by buf_pool->mutex. + State transitions from BUF_BLOCK_READY_FOR_USE to BUF_BLOCK_MEMORY + need not be protected by buf_page_get_mutex(). @see enum buf_page_state. + State changes that are relevant to page_hash are additionally protected + by the appropriate page_hash mutex i.e.: if a page is in page_hash or + is being added to/removed from page_hash then the corresponding changes + must also be protected by page_hash mutex. */ + unsigned state:BUF_PAGE_STATE_BITS; + +#endif /* PAGE_ATOMIC_REF_COUNT */ + +#ifndef UNIV_HOTBACKUP + unsigned flush_type:2; /*!< if this block is currently being + flushed to disk, this tells the + flush_type. + @see buf_flush_t */ + unsigned buf_pool_index:6;/*!< index number of the buffer pool + that this block belongs to */ +# if MAX_BUFFER_POOLS > 64 +# error "MAX_BUFFER_POOLS > 64; redefine buf_pool_index:6" +# endif + /* @} */ +#endif /* !UNIV_HOTBACKUP */ + page_zip_des_t zip; /*!< compressed page; zip.data + (but not the data it points to) is + also protected by buf_pool->mutex; + state == BUF_BLOCK_ZIP_PAGE and + zip.data == NULL means an active + buf_pool->watch */ +#ifndef UNIV_HOTBACKUP + buf_page_t* hash; /*!< node used in chaining to + buf_pool->page_hash or + buf_pool->zip_hash */ +#ifdef UNIV_DEBUG + ibool in_page_hash; /*!< TRUE if in buf_pool->page_hash */ + ibool in_zip_hash; /*!< TRUE if in buf_pool->zip_hash */ +#endif /* UNIV_DEBUG */ + + /** @name Page flushing fields + All these are protected by buf_pool->mutex. */ + /* @{ */ + + UT_LIST_NODE_T(buf_page_t) list; + /*!< based on state, this is a + list node, protected either by + buf_pool->mutex or by + buf_pool->flush_list_mutex, + in one of the following lists in + buf_pool: + + - BUF_BLOCK_NOT_USED: free + - BUF_BLOCK_FILE_PAGE: flush_list + - BUF_BLOCK_ZIP_DIRTY: flush_list + - BUF_BLOCK_ZIP_PAGE: zip_clean + + If bpage is part of flush_list + then the node pointers are + covered by buf_pool->flush_list_mutex. + Otherwise these pointers are + protected by buf_pool->mutex. + + The contents of the list node + is undefined if !in_flush_list + && state == BUF_BLOCK_FILE_PAGE, + or if state is one of + BUF_BLOCK_MEMORY, + BUF_BLOCK_REMOVE_HASH or + BUF_BLOCK_READY_IN_USE. */ + +#ifdef UNIV_DEBUG + ibool in_flush_list; /*!< TRUE if in buf_pool->flush_list; + when buf_pool->flush_list_mutex is + free, the following should hold: + in_flush_list + == (state == BUF_BLOCK_FILE_PAGE + || state == BUF_BLOCK_ZIP_DIRTY) + Writes to this field must be + covered by both block->mutex + and buf_pool->flush_list_mutex. Hence + reads can happen while holding + any one of the two mutexes */ + ibool in_free_list; /*!< TRUE if in buf_pool->free; when + buf_pool->mutex is free, the following + should hold: in_free_list + == (state == BUF_BLOCK_NOT_USED) */ +#endif /* UNIV_DEBUG */ + lsn_t newest_modification; + /*!< log sequence number of + the youngest modification to + this block, zero if not + modified. Protected by block + mutex */ + lsn_t oldest_modification; + /*!< log sequence number of + the START of the log entry + written of the oldest + modification to this block + which has not yet been flushed + on disk; zero if all + modifications are on disk. + Writes to this field must be + covered by both block->mutex + and buf_pool->flush_list_mutex. Hence + reads can happen while holding + any one of the two mutexes */ + /* @} */ + /** @name LRU replacement algorithm fields + These fields are protected by buf_pool->mutex only (not + buf_pool->zip_mutex or buf_block_t::mutex). */ + /* @{ */ + + UT_LIST_NODE_T(buf_page_t) LRU; + /*!< node of the LRU list */ +#ifdef UNIV_DEBUG + ibool in_LRU_list; /*!< TRUE if the page is in + the LRU list; used in + debugging */ +#endif /* UNIV_DEBUG */ + unsigned old:1; /*!< TRUE if the block is in the old + blocks in buf_pool->LRU_old */ + unsigned freed_page_clock:31;/*!< the value of + buf_pool->freed_page_clock + when this block was the last + time put to the head of the + LRU list; a thread is allowed + to read this for heuristic + purposes without holding any + mutex or latch */ + /* @} */ + unsigned access_time; /*!< time of first access, or + 0 if the block was never accessed + in the buffer pool. Protected by + block mutex */ +# if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG + ibool file_page_was_freed; + /*!< this is set to TRUE when + fsp frees a page in buffer pool; + protected by buf_pool->zip_mutex + or buf_block_t::mutex. */ +# endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ +}; + +/** The buffer control block structure */ + +struct buf_block_t{ + + /** @name General fields */ + /* @{ */ + + buf_page_t page; /*!< page information; this must + be the first field, so that + buf_pool->page_hash can point + to buf_page_t or buf_block_t */ + byte* frame; /*!< pointer to buffer frame which + is of size UNIV_PAGE_SIZE, and + aligned to an address divisible by + UNIV_PAGE_SIZE */ +#ifndef UNIV_HOTBACKUP + UT_LIST_NODE_T(buf_block_t) unzip_LRU; + /*!< node of the decompressed LRU list; + a block is in the unzip_LRU list + if page.state == BUF_BLOCK_FILE_PAGE + and page.zip.data != NULL */ +#ifdef UNIV_DEBUG + ibool in_unzip_LRU_list;/*!< TRUE if the page is in the + decompressed LRU list; + used in debugging */ +#endif /* UNIV_DEBUG */ + ib_mutex_t mutex; /*!< mutex protecting this block: + state (also protected by the buffer + pool mutex), io_fix, buf_fix_count, + and accessed; we introduce this new + mutex in InnoDB-5.1 to relieve + contention on the buffer pool mutex */ + rw_lock_t lock; /*!< read-write lock of the buffer + frame */ + unsigned lock_hash_val:32;/*!< hashed value of the page address + in the record lock hash table; + protected by buf_block_t::lock + (or buf_block_t::mutex, buf_pool->mutex + in buf_page_get_gen(), + buf_page_init_for_read() + and buf_page_create()) */ + ibool check_index_page_at_flush; + /*!< TRUE if we know that this is + an index page, and want the database + to check its consistency before flush; + note that there may be pages in the + buffer pool which are index pages, + but this flag is not set because + we do not keep track of all pages; + NOT protected by any mutex */ + /* @} */ + /** @name Optimistic search field */ + /* @{ */ + + ib_uint64_t modify_clock; /*!< this clock is incremented every + time a pointer to a record on the + page may become obsolete; this is + used in the optimistic cursor + positioning: if the modify clock has + not changed, we know that the pointer + is still valid; this field may be + changed if the thread (1) owns the + pool mutex and the page is not + bufferfixed, or (2) the thread has an + x-latch on the block */ + /* @} */ + /** @name Hash search fields (unprotected) + NOTE that these fields are NOT protected by any semaphore! */ + /* @{ */ + + ulint n_hash_helps; /*!< counter which controls building + of a new hash index for the page */ + ulint n_fields; /*!< recommended prefix length for hash + search: number of full fields */ + ulint n_bytes; /*!< recommended prefix: number of bytes + in an incomplete field */ + ibool left_side; /*!< TRUE or FALSE, depending on + whether the leftmost record of several + records with the same prefix should be + indexed in the hash index */ + /* @} */ + + /** @name Hash search fields + These 5 fields may only be modified when we have + an x-latch on btr_search_latch AND + - we are holding an s-latch or x-latch on buf_block_t::lock or + - we know that buf_block_t::buf_fix_count == 0. + + An exception to this is when we init or create a page + in the buffer pool in buf0buf.cc. + + Another exception is that assigning block->index = NULL + is allowed whenever holding an x-latch on btr_search_latch. */ + + /* @{ */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ulint n_pointers; /*!< used in debugging: the number of + pointers in the adaptive hash index + pointing to this frame */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + unsigned curr_n_fields:10;/*!< prefix length for hash indexing: + number of full fields */ + unsigned curr_n_bytes:15;/*!< number of bytes in hash + indexing */ + unsigned curr_left_side:1;/*!< TRUE or FALSE in hash indexing */ + dict_index_t* index; /*!< Index for which the + adaptive hash index has been + created, or NULL if the page + does not exist in the + index. Note that it does not + guarantee that the index is + complete, though: there may + have been hash collisions, + record deletions, etc. */ + /* @} */ +# ifdef UNIV_SYNC_DEBUG + /** @name Debug fields */ + /* @{ */ + rw_lock_t debug_latch; /*!< in the debug version, each thread + which bufferfixes the block acquires + an s-latch here; so we can use the + debug utilities in sync0rw */ + /* @} */ +# endif +#endif /* !UNIV_HOTBACKUP */ +}; + +/** Check if a buf_block_t object is in a valid state +@param block buffer block +@return TRUE if valid */ +#define buf_block_state_valid(block) \ +(buf_block_get_state(block) >= BUF_BLOCK_NOT_USED \ + && (buf_block_get_state(block) <= BUF_BLOCK_REMOVE_HASH)) + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Compute the hash fold value for blocks in buf_pool->zip_hash. */ +/* @{ */ +#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE) +#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame) +#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) +/* @} */ + +/** Struct that is embedded in the free zip blocks */ +struct buf_buddy_free_t { + union { + ulint size; /*!< size of the block */ + byte bytes[FIL_PAGE_DATA]; + /*!< stamp[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID] + == BUF_BUDDY_FREE_STAMP denotes a free + block. If the space_id field of buddy + block != BUF_BUDDY_FREE_STAMP, the block + is not in any zip_free list. If the + space_id is BUF_BUDDY_FREE_STAMP then + stamp[0] will contain the + buddy block size. */ + } stamp; + + buf_page_t bpage; /*!< Embedded bpage descriptor */ + UT_LIST_NODE_T(buf_buddy_free_t) list; + /*!< Node of zip_free list */ +}; + +/** @brief The buffer pool statistics structure. */ +struct buf_pool_stat_t{ + ulint n_page_gets; /*!< number of page gets performed; + also successful searches through + the adaptive hash index are + counted as page gets; this field + is NOT protected by the buffer + pool mutex */ + ulint n_pages_read; /*!< number read operations */ + ulint n_pages_written;/*!< number write operations */ + ulint n_pages_created;/*!< number of pages created + in the pool with no read */ + ulint n_ra_pages_read_rnd;/*!< number of pages read in + as part of random read ahead */ + ulint n_ra_pages_read;/*!< number of pages read in + as part of read ahead */ + ulint n_ra_pages_evicted;/*!< number of read ahead + pages that are evicted without + being accessed */ + ulint n_pages_made_young; /*!< number of pages made young, in + calls to buf_LRU_make_block_young() */ + ulint n_pages_not_made_young; /*!< number of pages not made + young because the first access + was not long enough ago, in + buf_page_peek_if_too_old() */ + ulint LRU_bytes; /*!< LRU size in bytes */ + ulint flush_list_bytes;/*!< flush_list size in bytes */ +}; + +/** Statistics of buddy blocks of a given size. */ +struct buf_buddy_stat_t { + /** Number of blocks allocated from the buddy system. */ + ulint used; + /** Number of blocks relocated by the buddy system. */ + ib_uint64_t relocated; + /** Total duration of block relocations, in microseconds. */ + ib_uint64_t relocated_usec; +}; + +/** @brief The buffer pool structure. + +NOTE! The definition appears here only for other modules of this +directory (buf) to see it. Do not use from outside! */ + +struct buf_pool_t{ + + /** @name General fields */ + /* @{ */ + ib_mutex_t mutex; /*!< Buffer pool mutex of this + instance */ + ib_mutex_t zip_mutex; /*!< Zip mutex of this buffer + pool instance, protects compressed + only pages (of type buf_page_t, not + buf_block_t */ + ulint instance_no; /*!< Array index of this buffer + pool instance */ + ulint old_pool_size; /*!< Old pool size in bytes */ + ulint curr_pool_size; /*!< Current pool size in bytes */ + ulint LRU_old_ratio; /*!< Reserve this much of the buffer + pool for "old" blocks */ +#ifdef UNIV_DEBUG + ulint buddy_n_frames; /*!< Number of frames allocated from + the buffer pool to the buddy system */ +#endif +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ulint mutex_exit_forbidden; /*!< Forbid release mutex */ +#endif + ulint n_chunks; /*!< number of buffer pool chunks */ + buf_chunk_t* chunks; /*!< buffer pool chunks */ + ulint curr_size; /*!< current pool size in pages */ + hash_table_t* page_hash; /*!< hash table of buf_page_t or + buf_block_t file pages, + buf_page_in_file() == TRUE, + indexed by (space_id, offset). + page_hash is protected by an + array of mutexes. + Changes in page_hash are protected + by buf_pool->mutex and the relevant + page_hash mutex. Lookups can happen + while holding the buf_pool->mutex or + the relevant page_hash mutex. */ + hash_table_t* zip_hash; /*!< hash table of buf_block_t blocks + whose frames are allocated to the + zip buddy system, + indexed by block->frame */ + ulint n_pend_reads; /*!< number of pending read + operations */ + ulint n_pend_unzip; /*!< number of pending decompressions */ + + time_t last_printout_time; + /*!< when buf_print_io was last time + called */ + buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1]; + /*!< Statistics of buddy system, + indexed by block size */ + buf_pool_stat_t stat; /*!< current statistics */ + buf_pool_stat_t old_stat; /*!< old statistics */ + + /* @} */ + + /** @name Page flushing algorithm fields */ + + /* @{ */ + + ib_mutex_t flush_list_mutex;/*!< mutex protecting the + flush list access. This mutex + protects flush_list, flush_rbt + and bpage::list pointers when + the bpage is on flush_list. It + also protects writes to + bpage::oldest_modification and + flush_list_hp */ + const buf_page_t* flush_list_hp;/*!< "hazard pointer" + used during scan of flush_list + while doing flush list batch. + Protected by flush_list_mutex */ + UT_LIST_BASE_NODE_T(buf_page_t) flush_list; + /*!< base node of the modified block + list */ + ibool init_flush[BUF_FLUSH_N_TYPES]; + /*!< this is TRUE when a flush of the + given type is being initialized */ + ulint n_flush[BUF_FLUSH_N_TYPES]; + /*!< this is the number of pending + writes in the given flush type */ + os_event_t no_flush[BUF_FLUSH_N_TYPES]; + /*!< this is in the set state + when there is no flush batch + of the given type running */ + ib_rbt_t* flush_rbt; /*!< a red-black tree is used + exclusively during recovery to + speed up insertions in the + flush_list. This tree contains + blocks in order of + oldest_modification LSN and is + kept in sync with the + flush_list. + Each member of the tree MUST + also be on the flush_list. + This tree is relevant only in + recovery and is set to NULL + once the recovery is over. + Protected by flush_list_mutex */ + ulint freed_page_clock;/*!< a sequence number used + to count the number of buffer + blocks removed from the end of + the LRU list; NOTE that this + counter may wrap around at 4 + billion! A thread is allowed + to read this for heuristic + purposes without holding any + mutex or latch */ + ibool try_LRU_scan; /*!< Set to FALSE when an LRU + scan for free block fails. This + flag is used to avoid repeated + scans of LRU list when we know + that there is no free block + available in the scan depth for + eviction. Set to TRUE whenever + we flush a batch from the + buffer pool. Protected by the + buf_pool->mutex */ + /* @} */ + + /** @name LRU replacement algorithm fields */ + /* @{ */ + + UT_LIST_BASE_NODE_T(buf_page_t) free; + /*!< base node of the free + block list */ + UT_LIST_BASE_NODE_T(buf_page_t) LRU; + /*!< base node of the LRU list */ + buf_page_t* LRU_old; /*!< pointer to the about + LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV + oldest blocks in the LRU list; + NULL if LRU length less than + BUF_LRU_OLD_MIN_LEN; + NOTE: when LRU_old != NULL, its length + should always equal LRU_old_len */ + ulint LRU_old_len; /*!< length of the LRU list from + the block to which LRU_old points + onward, including that block; + see buf0lru.cc for the restrictions + on this value; 0 if LRU_old == NULL; + NOTE: LRU_old_len must be adjusted + whenever LRU_old shrinks or grows! */ + + UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU; + /*!< base node of the + unzip_LRU list */ + + /* @} */ + /** @name Buddy allocator fields + The buddy allocator is used for allocating compressed page + frames and buf_page_t descriptors of blocks that exist + in the buffer pool only in compressed form. */ + /* @{ */ +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + UT_LIST_BASE_NODE_T(buf_page_t) zip_clean; + /*!< unmodified compressed pages */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX]; + /*!< buddy free lists */ + + buf_page_t* watch; + /*!< Sentinel records for buffer + pool watches. Protected by + buf_pool->mutex. */ + +#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN +# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN" +#endif + /* @} */ +}; + +/** @name Accessors for buf_pool->mutex. +Use these instead of accessing buf_pool->mutex directly. */ +/* @{ */ + +/** Test if a buffer pool mutex is owned. */ +#define buf_pool_mutex_own(b) mutex_own(&b->mutex) +/** Acquire a buffer pool mutex. */ +#define buf_pool_mutex_enter(b) do { \ + ut_ad(!mutex_own(&b->zip_mutex)); \ + mutex_enter(&b->mutex); \ +} while (0) + +/** Test if flush list mutex is owned. */ +#define buf_flush_list_mutex_own(b) mutex_own(&b->flush_list_mutex) + +/** Acquire the flush list mutex. */ +#define buf_flush_list_mutex_enter(b) do { \ + mutex_enter(&b->flush_list_mutex); \ +} while (0) +/** Release the flush list mutex. */ +# define buf_flush_list_mutex_exit(b) do { \ + mutex_exit(&b->flush_list_mutex); \ +} while (0) + +/** Test if block->mutex is owned. */ +#define buf_block_mutex_own(b) mutex_own(&(b)->mutex) + +/** Acquire the block->mutex. */ +#define buf_block_mutex_enter(b) do { \ + mutex_enter(&(b)->mutex); \ +} while (0) + +/** Release the trx->mutex. */ +#define buf_block_mutex_exit(b) do { \ + mutex_exit(&(b)->mutex); \ +} while (0) + + +/** Get appropriate page_hash_lock. */ +# define buf_page_hash_lock_get(b, f) \ + hash_get_lock(b->page_hash, f) + +#ifdef UNIV_SYNC_DEBUG +/** Test if page_hash lock is held in s-mode. */ +# define buf_page_hash_lock_held_s(b, p) \ + rw_lock_own(buf_page_hash_lock_get(b, \ + buf_page_address_fold(p->space, \ + p->offset)), \ + RW_LOCK_SHARED) + +/** Test if page_hash lock is held in x-mode. */ +# define buf_page_hash_lock_held_x(b, p) \ + rw_lock_own(buf_page_hash_lock_get(b, \ + buf_page_address_fold(p->space, \ + p->offset)), \ + RW_LOCK_EX) + +/** Test if page_hash lock is held in x or s-mode. */ +# define buf_page_hash_lock_held_s_or_x(b, p) \ + (buf_page_hash_lock_held_s(b, p) \ + || buf_page_hash_lock_held_x(b, p)) + +# define buf_block_hash_lock_held_s(b, p) \ + buf_page_hash_lock_held_s(b, &(p->page)) + +# define buf_block_hash_lock_held_x(b, p) \ + buf_page_hash_lock_held_x(b, &(p->page)) + +# define buf_block_hash_lock_held_s_or_x(b, p) \ + buf_page_hash_lock_held_s_or_x(b, &(p->page)) +#else /* UNIV_SYNC_DEBUG */ +# define buf_page_hash_lock_held_s(b, p) (TRUE) +# define buf_page_hash_lock_held_x(b, p) (TRUE) +# define buf_page_hash_lock_held_s_or_x(b, p) (TRUE) +# define buf_block_hash_lock_held_s(b, p) (TRUE) +# define buf_block_hash_lock_held_x(b, p) (TRUE) +# define buf_block_hash_lock_held_s_or_x(b, p) (TRUE) +#endif /* UNIV_SYNC_DEBUG */ + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/** Forbid the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_forbid(b) do { \ + ut_ad(buf_pool_mutex_own(b)); \ + b->mutex_exit_forbidden++; \ +} while (0) +/** Allow the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_allow(b) do { \ + ut_ad(buf_pool_mutex_own(b)); \ + ut_a(b->mutex_exit_forbidden); \ + b->mutex_exit_forbidden--; \ +} while (0) +/** Release the buffer pool mutex. */ +# define buf_pool_mutex_exit(b) do { \ + ut_a(!b->mutex_exit_forbidden); \ + mutex_exit(&b->mutex); \ +} while (0) +#else +/** Forbid the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_forbid(b) ((void) 0) +/** Allow the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_allow(b) ((void) 0) +/** Release the buffer pool mutex. */ +# define buf_pool_mutex_exit(b) mutex_exit(&b->mutex) +#endif +#endif /* !UNIV_HOTBACKUP */ +/* @} */ + +/********************************************************************** +Let us list the consistency conditions for different control block states. + +NOT_USED: is in free list, not in LRU list, not in flush list, nor + page hash table +READY_FOR_USE: is not in free list, LRU list, or flush list, nor page + hash table +MEMORY: is not in free list, LRU list, or flush list, nor page + hash table +FILE_PAGE: space and offset are defined, is in page hash table + if io_fix == BUF_IO_WRITE, + pool: no_flush[flush_type] is in reset state, + pool: n_flush[flush_type] > 0 + + (1) if buf_fix_count == 0, then + is in LRU list, not in free list + is in flush list, + if and only if oldest_modification > 0 + is x-locked, + if and only if io_fix == BUF_IO_READ + is s-locked, + if and only if io_fix == BUF_IO_WRITE + + (2) if buf_fix_count > 0, then + is not in LRU list, not in free list + is in flush list, + if and only if oldest_modification > 0 + if io_fix == BUF_IO_READ, + is x-locked + if io_fix == BUF_IO_WRITE, + is s-locked + +State transitions: + +NOT_USED => READY_FOR_USE +READY_FOR_USE => MEMORY +READY_FOR_USE => FILE_PAGE +MEMORY => NOT_USED +FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if + (1) buf_fix_count == 0, + (2) oldest_modification == 0, and + (3) io_fix == 0. +*/ + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/** Functor to validate the LRU list. */ +struct CheckInLRUList { + void operator()(const buf_page_t* elem) const + { + ut_a(elem->in_LRU_list); + } +}; + +/** Functor to validate the LRU list. */ +struct CheckInFreeList { + void operator()(const buf_page_t* elem) const + { + ut_a(elem->in_free_list); + } +}; + +struct CheckUnzipLRUAndLRUList { + void operator()(const buf_block_t* elem) const + { + ut_a(elem->page.in_LRU_list); + ut_a(elem->in_unzip_LRU_list); + } +}; +#endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */ + +#ifndef UNIV_NONINL +#include "buf0buf.ic" +#endif + +#endif diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic new file mode 100644 index 00000000000..56616c6deeb --- /dev/null +++ b/storage/innobase/include/buf0buf.ic @@ -0,0 +1,1460 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0buf.ic +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "mtr0mtr.h" +#ifndef UNIV_HOTBACKUP +#include "buf0flu.h" +#include "buf0lru.h" +#include "buf0rea.h" + +/** A chunk of buffers. The buffer pool is allocated in chunks. */ +struct buf_chunk_t{ + ulint mem_size; /*!< allocated size of the chunk */ + ulint size; /*!< size of frames[] and blocks[] */ + void* mem; /*!< pointer to the memory area which + was allocated for the frames */ + buf_block_t* blocks; /*!< array of buffer control blocks */ +}; + +/*********************************************************************//** +Gets the current size of buffer buf_pool in bytes. +@return size in bytes */ +UNIV_INLINE +ulint +buf_pool_get_curr_size(void) +/*========================*/ +{ + return(srv_buf_pool_curr_size); +} + +/********************************************************************//** +Calculates the index of a buffer pool to the buf_pool[] array. +@return the position of the buffer pool in buf_pool[] */ +UNIV_INLINE +ulint +buf_pool_index( +/*===========*/ + const buf_pool_t* buf_pool) /*!< in: buffer pool */ +{ + ulint i = buf_pool - buf_pool_ptr; + ut_ad(i < MAX_BUFFER_POOLS); + ut_ad(i < srv_buf_pool_instances); + return(i); +} + +/******************************************************************//** +Returns the buffer pool instance given a page instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_bpage( +/*================*/ + const buf_page_t* bpage) /*!< in: buffer pool page */ +{ + ulint i; + i = bpage->buf_pool_index; + ut_ad(i < srv_buf_pool_instances); + return(&buf_pool_ptr[i]); +} + +/******************************************************************//** +Returns the buffer pool instance given a block instance +@return buf_pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_block( +/*================*/ + const buf_block_t* block) /*!< in: block */ +{ + return(buf_pool_from_bpage(&block->page)); +} + +/*********************************************************************//** +Gets the current size of buffer buf_pool in pages. +@return size in pages*/ +UNIV_INLINE +ulint +buf_pool_get_n_pages(void) +/*======================*/ +{ + return(buf_pool_get_curr_size() / UNIV_PAGE_SIZE); +} + +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +ulint +buf_page_get_freed_page_clock( +/*==========================*/ + const buf_page_t* bpage) /*!< in: block */ +{ + /* This is sometimes read without holding buf_pool->mutex. */ + return(bpage->freed_page_clock); +} + +/********************************************************************//** +Reads the freed_page_clock of a buffer block. +@return freed_page_clock */ +UNIV_INLINE +ulint +buf_block_get_freed_page_clock( +/*===========================*/ + const buf_block_t* block) /*!< in: block */ +{ + return(buf_page_get_freed_page_clock(&block->page)); +} + +/********************************************************************//** +Tells if a block is still close enough to the MRU end of the LRU list +meaning that it is not in danger of getting evicted and also implying +that it has been accessed recently. +Note that this is for heuristics only and does not reserve buffer pool +mutex. +@return TRUE if block is close to MRU end of LRU */ +UNIV_INLINE +ibool +buf_page_peek_if_young( +/*===================*/ + const buf_page_t* bpage) /*!< in: block */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + /* FIXME: bpage->freed_page_clock is 31 bits */ + return((buf_pool->freed_page_clock & ((1UL << 31) - 1)) + < ((ulint) bpage->freed_page_clock + + (buf_pool->curr_size + * (BUF_LRU_OLD_RATIO_DIV - buf_pool->LRU_old_ratio) + / (BUF_LRU_OLD_RATIO_DIV * 4)))); +} + +/********************************************************************//** +Recommends a move of a block to the start of the LRU list if there is danger +of dropping from the buffer pool. NOTE: does not reserve the buffer pool +mutex. +@return TRUE if should be made younger */ +UNIV_INLINE +ibool +buf_page_peek_if_too_old( +/*=====================*/ + const buf_page_t* bpage) /*!< in: block to make younger */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + if (buf_pool->freed_page_clock == 0) { + /* If eviction has not started yet, do not update the + statistics or move blocks in the LRU list. This is + either the warm-up phase or an in-memory workload. */ + return(FALSE); + } else if (buf_LRU_old_threshold_ms && bpage->old) { + unsigned access_time = buf_page_is_accessed(bpage); + + if (access_time > 0 + && ((ib_uint32_t) (ut_time_ms() - access_time)) + >= buf_LRU_old_threshold_ms) { + return(TRUE); + } + + buf_pool->stat.n_pages_not_made_young++; + return(FALSE); + } else { + return(!buf_page_peek_if_young(bpage)); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Gets the state of a block. +@return state */ +UNIV_INLINE +enum buf_page_state +buf_page_get_state( +/*===============*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + enum buf_page_state state = (enum buf_page_state) bpage->state; + +#ifdef UNIV_DEBUG + switch (state) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_FILE_PAGE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ + + return(state); +} +/*********************************************************************//** +Gets the state of a block. +@return state */ +UNIV_INLINE +enum buf_page_state +buf_block_get_state( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + return(buf_page_get_state(&block->page)); +} +/*********************************************************************//** +Sets the state of a block. */ +UNIV_INLINE +void +buf_page_set_state( +/*===============*/ + buf_page_t* bpage, /*!< in/out: pointer to control block */ + enum buf_page_state state) /*!< in: state */ +{ +#ifdef UNIV_DEBUG + enum buf_page_state old_state = buf_page_get_state(bpage); + + switch (old_state) { + case BUF_BLOCK_POOL_WATCH: + ut_error; + break; + case BUF_BLOCK_ZIP_PAGE: + ut_a(state == BUF_BLOCK_ZIP_DIRTY); + break; + case BUF_BLOCK_ZIP_DIRTY: + ut_a(state == BUF_BLOCK_ZIP_PAGE); + break; + case BUF_BLOCK_NOT_USED: + ut_a(state == BUF_BLOCK_READY_FOR_USE); + break; + case BUF_BLOCK_READY_FOR_USE: + ut_a(state == BUF_BLOCK_MEMORY + || state == BUF_BLOCK_FILE_PAGE + || state == BUF_BLOCK_NOT_USED); + break; + case BUF_BLOCK_MEMORY: + ut_a(state == BUF_BLOCK_NOT_USED); + break; + case BUF_BLOCK_FILE_PAGE: + ut_a(state == BUF_BLOCK_NOT_USED + || state == BUF_BLOCK_REMOVE_HASH); + break; + case BUF_BLOCK_REMOVE_HASH: + ut_a(state == BUF_BLOCK_MEMORY); + break; + } +#endif /* UNIV_DEBUG */ + bpage->state = state; + ut_ad(buf_page_get_state(bpage) == state); +} + +/*********************************************************************//** +Sets the state of a block. */ +UNIV_INLINE +void +buf_block_set_state( +/*================*/ + buf_block_t* block, /*!< in/out: pointer to control block */ + enum buf_page_state state) /*!< in: state */ +{ + buf_page_set_state(&block->page, state); +} + +/*********************************************************************//** +Determines if a block is mapped to a tablespace. +@return TRUE if mapped */ +UNIV_INLINE +ibool +buf_page_in_file( +/*=============*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ +{ + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_POOL_WATCH: + ut_error; + break; + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_FILE_PAGE: + return(TRUE); + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + } + + return(FALSE); +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Determines if a block should be on unzip_LRU list. +@return TRUE if block belongs to unzip_LRU */ +UNIV_INLINE +ibool +buf_page_belongs_to_unzip_LRU( +/*==========================*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ +{ + ut_ad(buf_page_in_file(bpage)); + + return(bpage->zip.data + && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); +} + +/*********************************************************************//** +Gets the mutex of a block. +@return pointer to mutex protecting bpage */ +UNIV_INLINE +ib_mutex_t* +buf_page_get_mutex( +/*===============*/ + const buf_page_t* bpage) /*!< in: pointer to control block */ +{ + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_POOL_WATCH: + ut_error; + return(NULL); + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: { + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + return(&buf_pool->zip_mutex); + } + default: + return(&((buf_block_t*) bpage)->mutex); + } +} + +/*********************************************************************//** +Get the flush type of a page. +@return flush type */ +UNIV_INLINE +buf_flush_t +buf_page_get_flush_type( +/*====================*/ + const buf_page_t* bpage) /*!< in: buffer page */ +{ + buf_flush_t flush_type = (buf_flush_t) bpage->flush_type; + +#ifdef UNIV_DEBUG + switch (flush_type) { + case BUF_FLUSH_LRU: + case BUF_FLUSH_LIST: + case BUF_FLUSH_SINGLE_PAGE: + return(flush_type); + case BUF_FLUSH_N_TYPES: + ut_error; + } + ut_error; +#endif /* UNIV_DEBUG */ + return(flush_type); +} +/*********************************************************************//** +Set the flush type of a page. */ +UNIV_INLINE +void +buf_page_set_flush_type( +/*====================*/ + buf_page_t* bpage, /*!< in: buffer page */ + buf_flush_t flush_type) /*!< in: flush type */ +{ + bpage->flush_type = flush_type; + ut_ad(buf_page_get_flush_type(bpage) == flush_type); +} + +/*********************************************************************//** +Map a block to a file page. */ +UNIV_INLINE +void +buf_block_set_file_page( +/*====================*/ + buf_block_t* block, /*!< in/out: pointer to control block */ + ulint space, /*!< in: tablespace id */ + ulint page_no)/*!< in: page number */ +{ + buf_block_set_state(block, BUF_BLOCK_FILE_PAGE); + block->page.space = static_cast<ib_uint32_t>(space); + block->page.offset = static_cast<ib_uint32_t>(page_no); +} + +/*********************************************************************//** +Gets the io_fix state of a block. +@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_page_get_io_fix( +/*================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + ut_ad(bpage != NULL); + + enum buf_io_fix io_fix = (enum buf_io_fix) bpage->io_fix; +#ifdef UNIV_DEBUG + switch (io_fix) { + case BUF_IO_NONE: + case BUF_IO_READ: + case BUF_IO_WRITE: + case BUF_IO_PIN: + return(io_fix); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(io_fix); +} + +/*********************************************************************//** +Gets the io_fix state of a block. +@return io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_block_get_io_fix( +/*=================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + return(buf_page_get_io_fix(&block->page)); +} + +/*********************************************************************//** +Sets the io_fix state of a block. */ +UNIV_INLINE +void +buf_page_set_io_fix( +/*================*/ + buf_page_t* bpage, /*!< in/out: control block */ + enum buf_io_fix io_fix) /*!< in: io_fix state */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + bpage->io_fix = io_fix; + ut_ad(buf_page_get_io_fix(bpage) == io_fix); +} + +/*********************************************************************//** +Sets the io_fix state of a block. */ +UNIV_INLINE +void +buf_block_set_io_fix( +/*=================*/ + buf_block_t* block, /*!< in/out: control block */ + enum buf_io_fix io_fix) /*!< in: io_fix state */ +{ + buf_page_set_io_fix(&block->page, io_fix); +} + +/*********************************************************************//** +Makes a block sticky. A sticky block implies that even after we release +the buf_pool->mutex and the block->mutex: +* it cannot be removed from the flush_list +* the block descriptor cannot be relocated +* it cannot be removed from the LRU list +Note that: +* the block can still change its position in the LRU list +* the next and previous pointers can change. */ +UNIV_INLINE +void +buf_page_set_sticky( +/*================*/ + buf_page_t* bpage) /*!< in/out: control block */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE); + + bpage->io_fix = BUF_IO_PIN; +} + +/*********************************************************************//** +Removes stickiness of a block. */ +UNIV_INLINE +void +buf_page_unset_sticky( +/*==================*/ + buf_page_t* bpage) /*!< in/out: control block */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN); + + bpage->io_fix = BUF_IO_NONE; +} + +/********************************************************************//** +Determine if a buffer block can be relocated in memory. The block +can be dirty, but it must not be I/O-fixed or bufferfixed. */ +UNIV_INLINE +ibool +buf_page_can_relocate( +/*==================*/ + const buf_page_t* bpage) /*!< control block being relocated */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_LRU_list); + + return(buf_page_get_io_fix(bpage) == BUF_IO_NONE + && bpage->buf_fix_count == 0); +} + +/*********************************************************************//** +Determine if a block has been flagged old. +@return TRUE if old */ +UNIV_INLINE +ibool +buf_page_is_old( +/*============*/ + const buf_page_t* bpage) /*!< in: control block */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); +#endif + ut_ad(buf_page_in_file(bpage)); + + return(bpage->old); +} + +/*********************************************************************//** +Flag a block old. */ +UNIV_INLINE +void +buf_page_set_old( +/*=============*/ + buf_page_t* bpage, /*!< in/out: control block */ + ibool old) /*!< in: old */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); +#endif /* UNIV_DEBUG */ + ut_a(buf_page_in_file(bpage)); + ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(bpage->in_LRU_list); + +#ifdef UNIV_LRU_DEBUG + ut_a((buf_pool->LRU_old_len == 0) == (buf_pool->LRU_old == NULL)); + /* If a block is flagged "old", the LRU_old list must exist. */ + ut_a(!old || buf_pool->LRU_old); + + if (UT_LIST_GET_PREV(LRU, bpage) && UT_LIST_GET_NEXT(LRU, bpage)) { + const buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + const buf_page_t* next = UT_LIST_GET_NEXT(LRU, bpage); + if (prev->old == next->old) { + ut_a(prev->old == old); + } else { + ut_a(!prev->old); + ut_a(buf_pool->LRU_old == (old ? bpage : next)); + } + } +#endif /* UNIV_LRU_DEBUG */ + + bpage->old = old; +} + +/*********************************************************************//** +Determine the time of first access of a block in the buffer pool. +@return ut_time_ms() at the time of first access, 0 if not accessed */ +UNIV_INLINE +unsigned +buf_page_is_accessed( +/*=================*/ + const buf_page_t* bpage) /*!< in: control block */ +{ + ut_ad(buf_page_in_file(bpage)); + + return(bpage->access_time); +} + +/*********************************************************************//** +Flag a block accessed. */ +UNIV_INLINE +void +buf_page_set_accessed( +/*==================*/ + buf_page_t* bpage) /*!< in/out: control block */ +{ +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); +#endif /* UNIV_DEBUG */ + + ut_a(buf_page_in_file(bpage)); + + if (bpage->access_time == 0) { + /* Make this the time of the first access. */ + bpage->access_time = static_cast<uint>(ut_time_ms()); + } +} + +/*********************************************************************//** +Gets the buf_block_t handle of a buffered file block if an uncompressed +page frame exists, or NULL. +@return control block, or NULL */ +UNIV_INLINE +buf_block_t* +buf_page_get_block( +/*===============*/ + buf_page_t* bpage) /*!< in: control block, or NULL */ +{ + if (bpage != NULL) { + ut_ad(buf_page_in_file(bpage)); + + if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { + return((buf_block_t*) bpage); + } + } + + return(NULL); +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets a pointer to the memory frame of a block. +@return pointer to the frame */ +UNIV_INLINE +buf_frame_t* +buf_block_get_frame( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + ut_ad(block); + + switch (buf_block_get_state(block)) { + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_NOT_USED: + ut_error; + break; + case BUF_BLOCK_FILE_PAGE: +# ifndef UNIV_HOTBACKUP + ut_a(block->page.buf_fix_count > 0); +# endif /* !UNIV_HOTBACKUP */ + /* fall through */ + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + goto ok; + } + ut_error; +ok: + return((buf_frame_t*) block->frame); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Gets the space id of a block. +@return space id */ +UNIV_INLINE +ulint +buf_page_get_space( +/*===============*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + ut_ad(bpage); + ut_a(buf_page_in_file(bpage)); + + return(bpage->space); +} + +/*********************************************************************//** +Gets the space id of a block. +@return space id */ +UNIV_INLINE +ulint +buf_block_get_space( +/*================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + ut_ad(block); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + return(block->page.space); +} + +/*********************************************************************//** +Gets the page number of a block. +@return page number */ +UNIV_INLINE +ulint +buf_page_get_page_no( +/*=================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + ut_ad(bpage); + ut_a(buf_page_in_file(bpage)); + + return(bpage->offset); +} +/*********************************************************************** +FIXME_FTS Gets the frame the pointer is pointing to. */ +UNIV_INLINE +buf_frame_t* +buf_frame_align( +/*============*/ + /* out: pointer to frame */ + byte* ptr) /* in: pointer to a frame */ +{ + buf_frame_t* frame; + + ut_ad(ptr); + + frame = (buf_frame_t*) ut_align_down(ptr, UNIV_PAGE_SIZE); + + return(frame); +} + +/*********************************************************************//** +Gets the page number of a block. +@return page number */ +UNIV_INLINE +ulint +buf_block_get_page_no( +/*==================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + ut_ad(block); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + return(block->page.offset); +} + +/*********************************************************************//** +Gets the compressed page size of a block. +@return compressed page size, or 0 */ +UNIV_INLINE +ulint +buf_page_get_zip_size( +/*==================*/ + const buf_page_t* bpage) /*!< in: pointer to the control block */ +{ + return(bpage->zip.ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << bpage->zip.ssize : 0); +} + +/*********************************************************************//** +Gets the compressed page size of a block. +@return compressed page size, or 0 */ +UNIV_INLINE +ulint +buf_block_get_zip_size( +/*===================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ + return(block->page.zip.ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << block->page.zip.ssize : 0); +} + +#ifndef UNIV_HOTBACKUP +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +/*********************************************************************//** +Gets the compressed page descriptor corresponding to an uncompressed page +if applicable. +@return compressed page descriptor, or NULL */ +UNIV_INLINE +const page_zip_des_t* +buf_frame_get_page_zip( +/*===================*/ + const byte* ptr) /*!< in: pointer to the page */ +{ + return(buf_block_get_page_zip(buf_block_align(ptr))); +} +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Gets the space id, page offset, and byte offset within page of a +pointer pointing to a buffer frame containing a file page. */ +UNIV_INLINE +void +buf_ptr_get_fsp_addr( +/*=================*/ + const void* ptr, /*!< in: pointer to a buffer frame */ + ulint* space, /*!< out: space id */ + fil_addr_t* addr) /*!< out: page offset and byte offset */ +{ + const page_t* page = (const page_t*) ut_align_down(ptr, + UNIV_PAGE_SIZE); + + *space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + addr->page = mach_read_from_4(page + FIL_PAGE_OFFSET); + addr->boffset = ut_align_offset(ptr, UNIV_PAGE_SIZE); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Gets the hash value of the page the pointer is pointing to. This can be used +in searches in the lock hash table. +@return lock hash value */ +UNIV_INLINE +ulint +buf_block_get_lock_hash_val( +/*========================*/ + const buf_block_t* block) /*!< in: block */ +{ + ut_ad(block); + ut_ad(buf_page_in_file(&block->page)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_EXCLUSIVE) + || rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + return(block->lock_hash_val); +} + +/********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. +@return: the allocated descriptor. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ +{ + buf_page_t* bpage; + + bpage = (buf_page_t*) ut_malloc(sizeof *bpage); + ut_d(memset(bpage, 0, sizeof *bpage)); + UNIV_MEM_ALLOC(bpage, sizeof *bpage); + + return(bpage); +} + +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ +{ + ut_free(bpage); +} + +/********************************************************************//** +Frees a buffer block which does not contain a file page. */ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block) /*!< in, own: block to be freed */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*) block); + + buf_pool_mutex_enter(buf_pool); + + mutex_enter(&block->mutex); + + ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); + + buf_LRU_block_free_non_file_page(block); + + mutex_exit(&block->mutex); + + buf_pool_mutex_exit(buf_pool); +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Copies contents of a buffer frame to a given buffer. +@return buf */ +UNIV_INLINE +byte* +buf_frame_copy( +/*===========*/ + byte* buf, /*!< in: buffer to copy to */ + const buf_frame_t* frame) /*!< in: buffer frame */ +{ + ut_ad(buf && frame); + + ut_memcpy(buf, frame, UNIV_PAGE_SIZE); + + return(buf); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Calculates a folded value of a file page address to use in the page hash +table. +@return the folded value */ +UNIV_INLINE +ulint +buf_page_address_fold( +/*==================*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: offset of the page within space */ +{ + return((space << 20) + space + offset); +} + +/********************************************************************//** +Gets the youngest modification log sequence number for a frame. +Returns zero if not file page or no modification occurred yet. +@return newest modification to page */ +UNIV_INLINE +lsn_t +buf_page_get_newest_modification( +/*=============================*/ + const buf_page_t* bpage) /*!< in: block containing the + page frame */ +{ + lsn_t lsn; + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + if (buf_page_in_file(bpage)) { + lsn = bpage->newest_modification; + } else { + lsn = 0; + } + + mutex_exit(block_mutex); + + return(lsn); +} + +/********************************************************************//** +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +void +buf_block_modify_clock_inc( +/*=======================*/ + buf_block_t* block) /*!< in: block */ +{ +#ifdef UNIV_SYNC_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*) block); + + ut_ad((buf_pool_mutex_own(buf_pool) + && (block->page.buf_fix_count == 0)) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); +#endif /* UNIV_SYNC_DEBUG */ + + block->modify_clock++; +} + +/********************************************************************//** +Returns the value of the modify clock. The caller must have an s-lock +or x-lock on the block. +@return value */ +UNIV_INLINE +ib_uint64_t +buf_block_get_modify_clock( +/*=======================*/ + buf_block_t* block) /*!< in: block */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); +#endif /* UNIV_SYNC_DEBUG */ + + return(block->modify_clock); +} + +/*******************************************************************//** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_fix( +/*===========*/ + buf_block_t* block) /*!< in/out: block to bufferfix */ +{ +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_increment_uint32(&block->page.buf_fix_count, 1); +#else + ib_mutex_t* block_mutex = buf_page_get_mutex(&block->page); + + mutex_enter(block_mutex); + ++block->page.buf_fix_count; + mutex_exit(block_mutex); +#endif /* PAGE_ATOMIC_REF_COUNT */ +} + +/*******************************************************************//** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_inc_func( +/*=======================*/ +#ifdef UNIV_SYNC_DEBUG + const char* file, /*!< in: file name */ + ulint line, /*!< in: line */ +#endif /* UNIV_SYNC_DEBUG */ + buf_block_t* block) /*!< in/out: block to bufferfix */ +{ +#ifdef UNIV_SYNC_DEBUG + ibool ret; + + ret = rw_lock_s_lock_nowait(&(block->debug_latch), file, line); + ut_a(ret); +#endif /* UNIV_SYNC_DEBUG */ + +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_increment_uint32(&block->page.buf_fix_count, 1); +#else + ut_ad(mutex_own(&block->mutex)); + + ++block->page.buf_fix_count; +#endif /* PAGE_ATOMIC_REF_COUNT */ +} + +/*******************************************************************//** +Decrements the bufferfix count. */ +UNIV_INLINE +void +buf_block_unfix( +/*============*/ + buf_block_t* block) /*!< in/out: block to bufferunfix */ +{ + ut_ad(block->page.buf_fix_count > 0); + +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_decrement_uint32(&block->page.buf_fix_count, 1); +#else + ib_mutex_t* block_mutex = buf_page_get_mutex(&block->page); + + mutex_enter(block_mutex); + --block->page.buf_fix_count; + mutex_exit(block_mutex); +#endif /* PAGE_ATOMIC_REF_COUNT */ +} + +/*******************************************************************//** +Decrements the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_dec( +/*==================*/ + buf_block_t* block) /*!< in/out: block to bufferunfix */ +{ + ut_ad(block->page.buf_fix_count > 0); + +#ifdef PAGE_ATOMIC_REF_COUNT + os_atomic_decrement_uint32(&block->page.buf_fix_count, 1); +#else + mutex_enter(&block->mutex); + --block->page.buf_fix_count; + mutex_exit(&block->mutex); +#endif /* PAGE_ATOMIC_REF_COUNT */ + +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&block->debug_latch); +#endif +} + +/******************************************************************//** +Returns the buffer pool instance given space and offset of page +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_get( +/*==========*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: offset of the page within space */ +{ + ulint fold; + ulint index; + ulint ignored_offset; + + ignored_offset = offset >> 6; /* 2log of BUF_READ_AHEAD_AREA (64)*/ + fold = buf_page_address_fold(space, ignored_offset); + index = fold % srv_buf_pool_instances; + return(&buf_pool_ptr[index]); +} + +/******************************************************************//** +Returns the buffer pool instance given its array index +@return buffer pool */ +UNIV_INLINE +buf_pool_t* +buf_pool_from_array( +/*================*/ + ulint index) /*!< in: array index to get + buffer pool instance from */ +{ + ut_ad(index < MAX_BUFFER_POOLS); + ut_ad(index < srv_buf_pool_instances); + return(&buf_pool_ptr[index]); +} + +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +@return block, NULL if not found */ +UNIV_INLINE +buf_page_t* +buf_page_hash_get_low( +/*==================*/ + buf_pool_t* buf_pool,/*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: offset of the page within space */ + ulint fold) /*!< in: buf_page_address_fold(space, offset) */ +{ + buf_page_t* bpage; + +#ifdef UNIV_SYNC_DEBUG + ulint hash_fold; + rw_lock_t* hash_lock; + + hash_fold = buf_page_address_fold(space, offset); + ut_ad(hash_fold == fold); + + hash_lock = hash_get_lock(buf_pool->page_hash, fold); + ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX) + || rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Look for the page in the hash table */ + + HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage, + ut_ad(bpage->in_page_hash && !bpage->in_zip_hash + && buf_page_in_file(bpage)), + bpage->space == space && bpage->offset == offset); + if (bpage) { + ut_a(buf_page_in_file(bpage)); + ut_ad(bpage->in_page_hash); + ut_ad(!bpage->in_zip_hash); + } + + return(bpage); +} + +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +If the block is found and lock is not NULL then the appropriate +page_hash lock is acquired in the specified lock mode. Otherwise, +mode value is ignored. It is up to the caller to release the +lock. If the block is found and the lock is NULL then the page_hash +lock is released by this function. +@return block, NULL if not found, or watch sentinel (if watch is true) */ +UNIV_INLINE +buf_page_t* +buf_page_hash_get_locked( +/*=====================*/ + /*!< out: pointer to the bpage, + or NULL; if NULL, hash_lock + is also NULL. */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + rw_lock_t** lock, /*!< in/out: lock of the page + hash acquired if bpage is + found. NULL otherwise. If NULL + is passed then the hash_lock + is released by this function */ + ulint lock_mode, /*!< in: RW_LOCK_EX or + RW_LOCK_SHARED. Ignored if + lock == NULL */ + bool watch) /*!< in: if true, return watch + sentinel also. */ +{ + buf_page_t* bpage = NULL; + ulint fold; + rw_lock_t* hash_lock; + ulint mode = RW_LOCK_SHARED; + + if (lock != NULL) { + *lock = NULL; + ut_ad(lock_mode == RW_LOCK_EX + || lock_mode == RW_LOCK_SHARED); + mode = lock_mode; + } + + fold = buf_page_address_fold(space, offset); + hash_lock = hash_get_lock(buf_pool->page_hash, fold); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX) + && !rw_lock_own(hash_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (mode == RW_LOCK_SHARED) { + rw_lock_s_lock(hash_lock); + } else { + rw_lock_x_lock(hash_lock); + } + + bpage = buf_page_hash_get_low(buf_pool, space, offset, fold); + + if (!bpage || buf_pool_watch_is_sentinel(buf_pool, bpage)) { + if (!watch) { + bpage = NULL; + } + goto unlock_and_exit; + } + + ut_ad(buf_page_in_file(bpage)); + ut_ad(offset == bpage->offset); + ut_ad(space == bpage->space); + + if (lock == NULL) { + /* The caller wants us to release the page_hash lock */ + goto unlock_and_exit; + } else { + /* To be released by the caller */ + *lock = hash_lock; + goto exit; + } + +unlock_and_exit: + if (mode == RW_LOCK_SHARED) { + rw_lock_s_unlock(hash_lock); + } else { + rw_lock_x_unlock(hash_lock); + } +exit: + return(bpage); +} + +/******************************************************************//** +Returns the control block of a file page, NULL if not found. +If the block is found and lock is not NULL then the appropriate +page_hash lock is acquired in the specified lock mode. Otherwise, +mode value is ignored. It is up to the caller to release the +lock. If the block is found and the lock is NULL then the page_hash +lock is released by this function. +@return block, NULL if not found */ +UNIV_INLINE +buf_block_t* +buf_block_hash_get_locked( +/*=====================*/ + /*!< out: pointer to the bpage, + or NULL; if NULL, hash_lock + is also NULL. */ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + ulint space, /*!< in: space id */ + ulint offset, /*!< in: page number */ + rw_lock_t** lock, /*!< in/out: lock of the page + hash acquired if bpage is + found. NULL otherwise. If NULL + is passed then the hash_lock + is released by this function */ + ulint lock_mode) /*!< in: RW_LOCK_EX or + RW_LOCK_SHARED. Ignored if + lock == NULL */ +{ + buf_page_t* bpage = buf_page_hash_get_locked(buf_pool, + space, + offset, + lock, + lock_mode); + buf_block_t* block = buf_page_get_block(bpage); + + if (block) { + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!lock || rw_lock_own(*lock, lock_mode)); +#endif /* UNIV_SYNC_DEBUG */ + return(block); + } else if (bpage) { + /* It is not a block. Just a bpage */ + ut_ad(buf_page_in_file(bpage)); + + if (lock) { + if (lock_mode == RW_LOCK_SHARED) { + rw_lock_s_unlock(*lock); + } else { + rw_lock_x_unlock(*lock); + } + } + *lock = NULL; + return(NULL); + } + + ut_ad(!bpage); + ut_ad(lock == NULL ||*lock == NULL); + return(NULL); +} + +/********************************************************************//** +Returns TRUE if the page can be found in the buffer pool hash table. + +NOTE that it is possible that the page is not yet read from disk, +though. + +@return TRUE if found in the page hash table */ +UNIV_INLINE +ibool +buf_page_peek( +/*==========*/ + ulint space, /*!< in: space id */ + ulint offset) /*!< in: page number */ +{ + buf_pool_t* buf_pool = buf_pool_get(space, offset); + + return(buf_page_hash_get(buf_pool, space, offset) != NULL); +} + +/********************************************************************//** +Releases a compressed-only page acquired with buf_page_get_zip(). */ +UNIV_INLINE +void +buf_page_release_zip( +/*=================*/ + buf_page_t* bpage) /*!< in: buffer block */ +{ + buf_block_t* block; + + block = (buf_block_t*) bpage; + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_FILE_PAGE: +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&block->debug_latch); +#endif /* UNUV_SYNC_DEBUG */ + /* Fall through */ + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + buf_block_unfix(block); + return; + + case BUF_BLOCK_POOL_WATCH: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + } + + ut_error; +} + +/********************************************************************//** +Decrements the bufferfix count of a buffer control block and releases +a latch, if specified. */ +UNIV_INLINE +void +buf_page_release( +/*=============*/ + buf_block_t* block, /*!< in: buffer block */ + ulint rw_latch) /*!< in: RW_S_LATCH, RW_X_LATCH, + RW_NO_LATCH */ +{ + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&(block->debug_latch)); +#endif + if (rw_latch == RW_S_LATCH) { + rw_lock_s_unlock(&(block->lock)); + } else if (rw_latch == RW_X_LATCH) { + rw_lock_x_unlock(&(block->lock)); + } + + buf_block_unfix(block); +} + +#ifdef UNIV_SYNC_DEBUG +/*********************************************************************//** +Adds latch level info for the rw-lock protecting the buffer frame. This +should be called in the debug version after a successful latching of a +page if we know the latching order level of the acquired latch. */ +UNIV_INLINE +void +buf_block_dbg_add_level( +/*====================*/ + buf_block_t* block, /*!< in: buffer page + where we have acquired latch */ + ulint level) /*!< in: latching order level */ +{ + sync_thread_add_level(&block->lock, level, FALSE); +} + +#endif /* UNIV_SYNC_DEBUG */ +/********************************************************************//** +Acquire mutex on all buffer pool instances. */ +UNIV_INLINE +void +buf_pool_mutex_enter_all(void) +/*==========================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + buf_pool_mutex_enter(buf_pool); + } +} + +/********************************************************************//** +Release mutex on all buffer pool instances. */ +UNIV_INLINE +void +buf_pool_mutex_exit_all(void) +/*=========================*/ +{ + ulint i; + + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + buf_pool_mutex_exit(buf_pool); + } +} +/*********************************************************************//** +Get the nth chunk's buffer block in the specified buffer pool. +@return the nth chunk's buffer block. */ +UNIV_INLINE +buf_block_t* +buf_get_nth_chunk_block( +/*====================*/ + const buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ulint n, /*!< in: nth chunk in the buffer pool */ + ulint* chunk_size) /*!< in: chunk size */ +{ + const buf_chunk_t* chunk; + + chunk = buf_pool->chunks + n; + *chunk_size = chunk->size; + return(chunk->blocks); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h new file mode 100644 index 00000000000..cd21781dc6e --- /dev/null +++ b/storage/innobase/include/buf0checksum.h @@ -0,0 +1,88 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0checksum.h +Buffer pool checksum functions, also linked from /extra/innochecksum.cc + +Created Aug 11, 2011 Vasil Dimov +*******************************************************/ + +#ifndef buf0checksum_h +#define buf0checksum_h + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +#include "buf0types.h" + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************************//** +Calculates a page CRC32 which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. +@return checksum */ +UNIV_INTERN +ib_uint32_t +buf_calc_page_crc32( +/*================*/ + const byte* page); /*!< in: buffer page */ + +/********************************************************************//** +Calculates a page checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. +@return checksum */ +UNIV_INTERN +ulint +buf_calc_page_new_checksum( +/*=======================*/ + const byte* page); /*!< in: buffer page */ + +/********************************************************************//** +In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only +looked at the first few bytes of the page. This calculates that old +checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! +@return checksum */ +UNIV_INTERN +ulint +buf_calc_page_old_checksum( +/*=======================*/ + const byte* page); /*!< in: buffer page */ + +#ifndef UNIV_INNOCHECKSUM + +/********************************************************************//** +Return a printable string describing the checksum algorithm. +@return algorithm name */ +UNIV_INTERN +const char* +buf_checksum_algorithm_name( +/*========================*/ + srv_checksum_algorithm_t algo); /*!< in: algorithm */ + +extern ulong srv_checksum_algorithm; + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif /* buf0checksum_h */ diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h new file mode 100644 index 00000000000..a62a6400d97 --- /dev/null +++ b/storage/innobase/include/buf0dblwr.h @@ -0,0 +1,162 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0dblwr.h +Doublewrite buffer module + +Created 2011/12/19 Inaam Rana +*******************************************************/ + +#ifndef buf0dblwr_h +#define buf0dblwr_h + +#include "univ.i" +#include "ut0byte.h" +#include "log0log.h" +#include "log0recv.h" + +#ifndef UNIV_HOTBACKUP + +/** Doublewrite system */ +extern buf_dblwr_t* buf_dblwr; +/** Set to TRUE when the doublewrite buffer is being created */ +extern ibool buf_dblwr_being_created; + +/****************************************************************//** +Creates the doublewrite buffer to a new InnoDB installation. The header of the +doublewrite buffer is placed on the trx system header page. */ +UNIV_INTERN +void +buf_dblwr_create(void); +/*==================*/ + +/****************************************************************//** +At a database startup initializes the doublewrite buffer memory structure if +we already have a doublewrite buffer created in the data files. If we are +upgrading to an InnoDB version which supports multiple tablespaces, then this +function performs the necessary update operations. If we are in a crash +recovery, this function loads the pages from double write buffer into memory. */ +void +buf_dblwr_init_or_load_pages( +/*=========================*/ + os_file_t file, + char* path, + bool load_corrupt_pages); + +/****************************************************************//** +Process the double write buffer pages. */ +void +buf_dblwr_process(void); +/*===================*/ + +/****************************************************************//** +frees doublewrite buffer. */ +UNIV_INTERN +void +buf_dblwr_free(void); +/*================*/ +/********************************************************************//** +Updates the doublewrite buffer when an IO request is completed. */ +UNIV_INTERN +void +buf_dblwr_update( +/*=============*/ + const buf_page_t* bpage, /*!< in: buffer block descriptor */ + buf_flush_t flush_type);/*!< in: flush type */ +/****************************************************************//** +Determines if a page number is located inside the doublewrite buffer. +@return TRUE if the location is inside the two blocks of the +doublewrite buffer */ +UNIV_INTERN +ibool +buf_dblwr_page_inside( +/*==================*/ + ulint page_no); /*!< in: page number */ +/********************************************************************//** +Posts a buffer page for writing. If the doublewrite memory buffer is +full, calls buf_dblwr_flush_buffered_writes and waits for for free +space to appear. */ +UNIV_INTERN +void +buf_dblwr_add_to_batch( +/*====================*/ + buf_page_t* bpage); /*!< in: buffer block to write */ +/********************************************************************//** +Flushes possible buffered writes from the doublewrite memory buffer to disk, +and also wakes up the aio thread if simulated aio is used. It is very +important to call this function after a batch of writes has been posted, +and also when we may have to wait for a page latch! Otherwise a deadlock +of threads can occur. */ +UNIV_INTERN +void +buf_dblwr_flush_buffered_writes(void); +/*=================================*/ +/********************************************************************//** +Writes a page to the doublewrite buffer on disk, sync it, then write +the page to the datafile and sync the datafile. This function is used +for single page flushes. If all the buffers allocated for single page +flushes in the doublewrite buffer are in use we wait here for one to +become free. We are guaranteed that a slot will become free because any +thread that is using a slot must also release the slot before leaving +this function. */ +UNIV_INTERN +void +buf_dblwr_write_single_page( +/*========================*/ + buf_page_t* bpage, /*!< in: buffer block to write */ + bool sync); /*!< in: true if sync IO requested */ + +/** Doublewrite control struct */ +struct buf_dblwr_t{ + ib_mutex_t mutex; /*!< mutex protecting the first_free + field and write_buf */ + ulint block1; /*!< the page number of the first + doublewrite block (64 pages) */ + ulint block2; /*!< page number of the second block */ + ulint first_free;/*!< first free position in write_buf + measured in units of UNIV_PAGE_SIZE */ + ulint b_reserved;/*!< number of slots currently reserved + for batch flush. */ + os_event_t b_event;/*!< event where threads wait for a + batch flush to end. */ + ulint s_reserved;/*!< number of slots currently + reserved for single page flushes. */ + os_event_t s_event;/*!< event where threads wait for a + single page flush slot. */ + bool* in_use; /*!< flag used to indicate if a slot is + in use. Only used for single page + flushes. */ + bool batch_running;/*!< set to TRUE if currently a batch + is being written from the doublewrite + buffer. */ + byte* write_buf;/*!< write buffer used in writing to the + doublewrite buffer, aligned to an + address divisible by UNIV_PAGE_SIZE + (which is required by Windows aio) */ + byte* write_buf_unaligned;/*!< pointer to write_buf, + but unaligned */ + buf_page_t** buf_block_arr;/*!< array to store pointers to + the buffer blocks which have been + cached to write_buf */ +}; + + +#endif /* UNIV_HOTBACKUP */ + +#endif diff --git a/storage/innobase/include/buf0dump.h b/storage/innobase/include/buf0dump.h new file mode 100644 index 00000000000..c704a8e97e0 --- /dev/null +++ b/storage/innobase/include/buf0dump.h @@ -0,0 +1,72 @@ +/***************************************************************************** + +Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0dump.h +Implements a buffer pool dump/load. + +Created April 08, 2011 Vasil Dimov +*******************************************************/ + +#ifndef buf0dump_h +#define buf0dump_h + +#include "univ.i" + +/*****************************************************************//** +Wakes up the buffer pool dump/load thread and instructs it to start +a dump. This function is called by MySQL code via buffer_pool_dump_now() +and it should return immediately because the whole MySQL is frozen during +its execution. */ +UNIV_INTERN +void +buf_dump_start(); +/*============*/ + +/*****************************************************************//** +Wakes up the buffer pool dump/load thread and instructs it to start +a load. This function is called by MySQL code via buffer_pool_load_now() +and it should return immediately because the whole MySQL is frozen during +its execution. */ +UNIV_INTERN +void +buf_load_start(); +/*============*/ + +/*****************************************************************//** +Aborts a currently running buffer pool load. This function is called by +MySQL code via buffer_pool_load_abort() and it should return immediately +because the whole MySQL is frozen during its execution. */ +UNIV_INTERN +void +buf_load_abort(); +/*============*/ + +/*****************************************************************//** +This is the main thread for buffer pool dump/load. It waits for an +event and when waked up either performs a dump or load and sleeps +again. +@return this function does not return, it calls os_thread_exit() */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(buf_dump_thread)( +/*============================*/ + void* arg); /*!< in: a dummy parameter + required by os_thread_create */ + +#endif /* buf0dump_h */ diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h new file mode 100644 index 00000000000..f116720574b --- /dev/null +++ b/storage/innobase/include/buf0flu.h @@ -0,0 +1,286 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0flu.h +The database buffer pool flush algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0flu_h +#define buf0flu_h + +#include "univ.i" +#include "ut0byte.h" +#include "log0log.h" +#ifndef UNIV_HOTBACKUP +#include "mtr0types.h" +#include "buf0types.h" + +/** Flag indicating if the page_cleaner is in active state. */ +extern ibool buf_page_cleaner_is_active; + +/********************************************************************//** +Remove a block from the flush list of modified blocks. */ +UNIV_INTERN +void +buf_flush_remove( +/*=============*/ + buf_page_t* bpage); /*!< in: pointer to the block in question */ +/*******************************************************************//** +Relocates a buffer control block on the flush_list. +Note that it is assumed that the contents of bpage has already been +copied to dpage. */ +UNIV_INTERN +void +buf_flush_relocate_on_flush_list( +/*=============================*/ + buf_page_t* bpage, /*!< in/out: control block being moved */ + buf_page_t* dpage); /*!< in/out: destination block */ +/********************************************************************//** +Updates the flush system data structures when a write is completed. */ +UNIV_INTERN +void +buf_flush_write_complete( +/*=====================*/ + buf_page_t* bpage); /*!< in: pointer to the block in question */ +#endif /* !UNIV_HOTBACKUP */ +/********************************************************************//** +Initializes a page for writing to the tablespace. */ +UNIV_INTERN +void +buf_flush_init_for_writing( +/*=======================*/ + byte* page, /*!< in/out: page */ + void* page_zip_, /*!< in/out: compressed page, or NULL */ + lsn_t newest_lsn); /*!< in: newest modification lsn + to the page */ +#ifndef UNIV_HOTBACKUP +# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/********************************************************************//** +Writes a flushable page asynchronously from the buffer pool to a file. +NOTE: buf_pool->mutex and block->mutex must be held upon entering this +function, and they will be released by this function after flushing. +This is loosely based on buf_flush_batch() and buf_flush_page(). +@return TRUE if the page was flushed and the mutexes released */ +UNIV_INTERN +ibool +buf_flush_page_try( +/*===============*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + buf_block_t* block) /*!< in/out: buffer control block */ + __attribute__((nonnull, warn_unused_result)); +# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ +/*******************************************************************//** +This utility flushes dirty blocks from the end of the flush list of +all buffer pool instances. +NOTE: The calling thread is not allowed to own any latches on pages! +@return true if a batch was queued successfully for each buffer pool +instance. false if another batch of same type was already running in +at least one of the buffer pool instance */ +UNIV_INTERN +bool +buf_flush_list( +/*===========*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed); /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ +/******************************************************************//** +This function picks up a single dirty page from the tail of the LRU +list, flushes it, removes it from page_hash and LRU list and puts +it on the free list. It is called from user threads when they are +unable to find a replacable page at the tail of the LRU list i.e.: +when the background LRU flushing in the page_cleaner thread is not +fast enough to keep pace with the workload. +@return TRUE if success. */ +UNIV_INTERN +ibool +buf_flush_single_page_from_LRU( +/*===========================*/ + buf_pool_t* buf_pool); /*!< in/out: buffer pool instance */ +/******************************************************************//** +Waits until a flush batch of the given type ends */ +UNIV_INTERN +void +buf_flush_wait_batch_end( +/*=====================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_flush_t type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +/******************************************************************//** +Waits until a flush batch of the given type ends. This is called by +a thread that only wants to wait for a flush to end but doesn't do +any flushing itself. */ +UNIV_INTERN +void +buf_flush_wait_batch_end_wait_only( +/*===============================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_flush_t type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +/********************************************************************//** +This function should be called at a mini-transaction commit, if a page was +modified in it. Puts the block to the list of modified blocks, if it not +already in it. */ +UNIV_INLINE +void +buf_flush_note_modification( +/*========================*/ + buf_block_t* block, /*!< in: block which is modified */ + mtr_t* mtr); /*!< in: mtr */ +/********************************************************************//** +This function should be called when recovery has modified a buffer page. */ +UNIV_INLINE +void +buf_flush_recv_note_modification( +/*=============================*/ + buf_block_t* block, /*!< in: block which is modified */ + lsn_t start_lsn, /*!< in: start lsn of the first mtr in a + set of mtr's */ + lsn_t end_lsn); /*!< in: end lsn of the last mtr in the + set of mtr's */ +/********************************************************************//** +Returns TRUE if the file page block is immediately suitable for replacement, +i.e., transition FILE_PAGE => NOT_USED allowed. +@return TRUE if can replace immediately */ +UNIV_INTERN +ibool +buf_flush_ready_for_replace( +/*========================*/ + buf_page_t* bpage); /*!< in: buffer control block, must be + buf_page_in_file(bpage) and in the LRU list */ +/******************************************************************//** +page_cleaner thread tasked with flushing dirty pages from the buffer +pools. As of now we'll have only one instance of this thread. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(buf_flush_page_cleaner_thread)( +/*==========================================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. +@return total pages flushed */ +UNIV_INTERN +ulint +buf_flush_LRU_tail(void); +/*====================*/ +/*********************************************************************//** +Wait for any possible LRU flushes that are in progress to end. */ +UNIV_INTERN +void +buf_flush_wait_LRU_batch_end(void); +/*==============================*/ + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/******************************************************************//** +Validates the flush list. +@return TRUE if ok */ +UNIV_INTERN +ibool +buf_flush_validate( +/*===============*/ + buf_pool_t* buf_pool); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +/********************************************************************//** +Initialize the red-black tree to speed up insertions into the flush_list +during recovery process. Should be called at the start of recovery +process before any page has been read/written. */ +UNIV_INTERN +void +buf_flush_init_flush_rbt(void); +/*==========================*/ + +/********************************************************************//** +Frees up the red-black tree. */ +UNIV_INTERN +void +buf_flush_free_flush_rbt(void); +/*==========================*/ + +/********************************************************************//** +Writes a flushable page asynchronously from the buffer pool to a file. +NOTE: in simulated aio we must call +os_aio_simulated_wake_handler_threads after we have posted a batch of +writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be +held upon entering this function, and they will be released by this +function if it returns true. +@return TRUE if the page was flushed */ +UNIV_INTERN +bool +buf_flush_page( +/*===========*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_page_t* bpage, /*!< in: buffer control block */ + buf_flush_t flush_type, /*!< in: type of flush */ + bool sync); /*!< in: true if sync IO request */ +/********************************************************************//** +Returns true if the block is modified and ready for flushing. +@return true if can flush immediately */ +UNIV_INTERN +bool +buf_flush_ready_for_flush( +/*======================*/ + buf_page_t* bpage, /*!< in: buffer control block, must be + buf_page_in_file(bpage) */ + buf_flush_t flush_type)/*!< in: type of flush */ + __attribute__((warn_unused_result)); + +#ifdef UNIV_DEBUG +/******************************************************************//** +Check if there are any dirty pages that belong to a space id in the flush +list in a particular buffer pool. +@return number of dirty pages present in a single buffer pool */ +UNIV_INTERN +ulint +buf_pool_get_dirty_pages_count( +/*===========================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool */ + ulint id); /*!< in: space id to check */ +/******************************************************************//** +Check if there are any dirty pages that belong to a space id in the flush list. +@return count of dirty pages present in all the buffer pools */ +UNIV_INTERN +ulint +buf_flush_get_dirty_pages_count( +/*============================*/ + ulint id); /*!< in: space id to check */ +#endif /* UNIV_DEBUG */ + +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_NONINL +#include "buf0flu.ic" +#endif + +#endif diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic new file mode 100644 index 00000000000..a763cd115fe --- /dev/null +++ b/storage/innobase/include/buf0flu.ic @@ -0,0 +1,139 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0flu.ic +The database buffer pool flush algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_HOTBACKUP +#include "buf0buf.h" +#include "mtr0mtr.h" +#include "srv0srv.h" + +/********************************************************************//** +Inserts a modified block into the flush list. */ +UNIV_INTERN +void +buf_flush_insert_into_flush_list( +/*=============================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_block_t* block, /*!< in/out: block which is modified */ + lsn_t lsn); /*!< in: oldest modification */ +/********************************************************************//** +Inserts a modified block into the flush list in the right sorted position. +This function is used by recovery, because there the modifications do not +necessarily come in the order of lsn's. */ +UNIV_INTERN +void +buf_flush_insert_sorted_into_flush_list( +/*====================================*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_block_t* block, /*!< in/out: block which is modified */ + lsn_t lsn); /*!< in: oldest modification */ + +/********************************************************************//** +This function should be called at a mini-transaction commit, if a page was +modified in it. Puts the block to the list of modified blocks, if it is not +already in it. */ +UNIV_INLINE +void +buf_flush_note_modification( +/*========================*/ + buf_block_t* block, /*!< in: block which is modified */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_pool_t* buf_pool = buf_pool_from_block(block); + + ut_ad(!srv_read_only_mode); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count > 0); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(!buf_flush_list_mutex_own(buf_pool)); + ut_ad(!mtr->made_dirty || log_flush_order_mutex_own()); + + ut_ad(mtr->start_lsn != 0); + ut_ad(mtr->modifications); + + mutex_enter(&block->mutex); + ut_ad(block->page.newest_modification <= mtr->end_lsn); + + block->page.newest_modification = mtr->end_lsn; + + if (!block->page.oldest_modification) { + ut_a(mtr->made_dirty); + ut_ad(log_flush_order_mutex_own()); + buf_flush_insert_into_flush_list( + buf_pool, block, mtr->start_lsn); + } else { + ut_ad(block->page.oldest_modification <= mtr->start_lsn); + } + + mutex_exit(&block->mutex); + + srv_stats.buf_pool_write_requests.inc(); +} + +/********************************************************************//** +This function should be called when recovery has modified a buffer page. */ +UNIV_INLINE +void +buf_flush_recv_note_modification( +/*=============================*/ + buf_block_t* block, /*!< in: block which is modified */ + lsn_t start_lsn, /*!< in: start lsn of the first mtr in a + set of mtr's */ + lsn_t end_lsn) /*!< in: end lsn of the last mtr in the + set of mtr's */ +{ + buf_pool_t* buf_pool = buf_pool_from_block(block); + + ut_ad(!srv_read_only_mode); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count > 0); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(!buf_flush_list_mutex_own(buf_pool)); + ut_ad(log_flush_order_mutex_own()); + + ut_ad(start_lsn != 0); + ut_ad(block->page.newest_modification <= end_lsn); + + mutex_enter(&block->mutex); + block->page.newest_modification = end_lsn; + + if (!block->page.oldest_modification) { + buf_flush_insert_sorted_into_flush_list( + buf_pool, block, start_lsn); + } else { + ut_ad(block->page.oldest_modification <= start_lsn); + } + + mutex_exit(&block->mutex); + +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h new file mode 100644 index 00000000000..ecdaef685a1 --- /dev/null +++ b/storage/innobase/include/buf0lru.h @@ -0,0 +1,310 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0lru.h +The database buffer pool LRU replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0lru_h +#define buf0lru_h + +#include "univ.i" +#ifndef UNIV_HOTBACKUP +#include "ut0byte.h" +#include "buf0types.h" + +// Forward declaration +struct trx_t; + +/******************************************************************//** +Returns TRUE if less than 25 % of the buffer pool is available. This can be +used in heuristics to prevent huge transactions eating up the whole buffer +pool for their locks. +@return TRUE if less than 25 % of buffer pool left */ +UNIV_INTERN +ibool +buf_LRU_buf_pool_running_out(void); +/*==============================*/ + +/*####################################################################### +These are low-level functions +#########################################################################*/ + +/** Minimum LRU list length for which the LRU_old pointer is defined */ +#define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */ + +/******************************************************************//** +Flushes all dirty pages or removes all pages belonging +to a given tablespace. A PROBLEM: if readahead is being started, what +guarantees that it will not try to read in pages after this operation +has completed? */ +UNIV_INTERN +void +buf_LRU_flush_or_remove_pages( +/*==========================*/ + ulint id, /*!< in: space id */ + buf_remove_t buf_remove, /*!< in: remove or flush strategy */ + const trx_t* trx); /*!< to check if the operation must + be interrupted */ + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/********************************************************************//** +Insert a compressed block into buf_pool->zip_clean in the LRU order. */ +UNIV_INTERN +void +buf_LRU_insert_zip_clean( +/*=====================*/ + buf_page_t* bpage); /*!< in: pointer to the block in question */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +/******************************************************************//** +Try to free a block. If bpage is a descriptor of a compressed-only +page, the descriptor object will be freed as well. + +NOTE: If this function returns true, it will temporarily +release buf_pool->mutex. Furthermore, the page frame will no longer be +accessible via bpage. + +The caller must hold buf_pool->mutex and must not hold any +buf_page_get_mutex() when calling this function. +@return true if freed, false otherwise. */ +UNIV_INTERN +bool +buf_LRU_free_page( +/*==============*/ + buf_page_t* bpage, /*!< in: block to be freed */ + bool zip) /*!< in: true if should remove also the + compressed page of an uncompressed page */ + __attribute__((nonnull)); +/******************************************************************//** +Try to free a replaceable block. +@return TRUE if found and freed */ +UNIV_INTERN +ibool +buf_LRU_scan_and_free_block( +/*========================*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + ibool scan_all) /*!< in: scan whole LRU list + if TRUE, otherwise scan only + 'old' blocks. */ + __attribute__((nonnull,warn_unused_result)); +/******************************************************************//** +Returns a free block from the buf_pool. The block is taken off the +free list. If it is empty, returns NULL. +@return a free control block, or NULL if the buf_block->free list is empty */ +UNIV_INTERN +buf_block_t* +buf_LRU_get_free_only( +/*==================*/ + buf_pool_t* buf_pool); /*!< buffer pool instance */ +/******************************************************************//** +Returns a free block from the buf_pool. The block is taken off the +free list. If it is empty, blocks are moved from the end of the +LRU list to the free list. +This function is called from a user thread when it needs a clean +block to read in a page. Note that we only ever get a block from +the free list. Even when we flush a page or find a page in LRU scan +we put it to free list to be used. +* iteration 0: + * get a block from free list, success:done + * if there is an LRU flush batch in progress: + * wait for batch to end: retry free list + * if buf_pool->try_LRU_scan is set + * scan LRU up to srv_LRU_scan_depth to find a clean block + * the above will put the block on free list + * success:retry the free list + * flush one dirty page from tail of LRU to disk + * the above will put the block on free list + * success: retry the free list +* iteration 1: + * same as iteration 0 except: + * scan whole LRU list + * scan LRU list even if buf_pool->try_LRU_scan is not set +* iteration > 1: + * same as iteration 1 but sleep 100ms +@return the free control block, in state BUF_BLOCK_READY_FOR_USE */ +UNIV_INTERN +buf_block_t* +buf_LRU_get_free_block( +/*===================*/ + buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */ + __attribute__((nonnull,warn_unused_result)); +/******************************************************************//** +Determines if the unzip_LRU list should be used for evicting a victim +instead of the general LRU list. +@return TRUE if should use unzip_LRU */ +UNIV_INTERN +ibool +buf_LRU_evict_from_unzip_LRU( +/*=========================*/ + buf_pool_t* buf_pool); +/******************************************************************//** +Puts a block back to the free list. */ +UNIV_INTERN +void +buf_LRU_block_free_non_file_page( +/*=============================*/ + buf_block_t* block); /*!< in: block, must not contain a file page */ +/******************************************************************//** +Adds a block to the LRU list. Please make sure that the zip_size is +already set into the page zip when invoking the function, so that we +can get correct zip_size from the buffer page when adding a block +into LRU */ +UNIV_INTERN +void +buf_LRU_add_block( +/*==============*/ + buf_page_t* bpage, /*!< in: control block */ + ibool old); /*!< in: TRUE if should be put to the old + blocks in the LRU list, else put to the + start; if the LRU list is very short, added to + the start regardless of this parameter */ +/******************************************************************//** +Adds a block to the LRU list of decompressed zip pages. */ +UNIV_INTERN +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /*!< in: control block */ + ibool old); /*!< in: TRUE if should be put to the end + of the list, else put to the start */ +/******************************************************************//** +Moves a block to the start of the LRU list. */ +UNIV_INTERN +void +buf_LRU_make_block_young( +/*=====================*/ + buf_page_t* bpage); /*!< in: control block */ +/******************************************************************//** +Moves a block to the end of the LRU list. */ +UNIV_INTERN +void +buf_LRU_make_block_old( +/*===================*/ + buf_page_t* bpage); /*!< in: control block */ +/**********************************************************************//** +Updates buf_pool->LRU_old_ratio. +@return updated old_pct */ +UNIV_INTERN +ulint +buf_LRU_old_ratio_update( +/*=====================*/ + uint old_pct,/*!< in: Reserve this percentage of + the buffer pool for "old" blocks. */ + ibool adjust);/*!< in: TRUE=adjust the LRU list; + FALSE=just assign buf_pool->LRU_old_ratio + during the initialization of InnoDB */ +/********************************************************************//** +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +UNIV_INTERN +void +buf_LRU_stat_update(void); +/*=====================*/ + +/******************************************************************//** +Remove one page from LRU list and put it to free list */ +UNIV_INTERN +void +buf_LRU_free_one_page( +/*==================*/ + buf_page_t* bpage) /*!< in/out: block, must contain a file page and + be in a state where it can be freed; there + may or may not be a hash index to the page */ + __attribute__((nonnull)); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/**********************************************************************//** +Validates the LRU list. +@return TRUE */ +UNIV_INTERN +ibool +buf_LRU_validate(void); +/*==================*/ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/**********************************************************************//** +Prints the LRU list. */ +UNIV_INTERN +void +buf_LRU_print(void); +/*===============*/ +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ + +/** @name Heuristics for detecting index scan @{ */ +/** The denominator of buf_pool->LRU_old_ratio. */ +#define BUF_LRU_OLD_RATIO_DIV 1024 +/** Maximum value of buf_pool->LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_pool->LRU_old_ratio_update */ +#define BUF_LRU_OLD_RATIO_MAX BUF_LRU_OLD_RATIO_DIV +/** Minimum value of buf_pool->LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_pool->LRU_old_ratio_update +The minimum must exceed +(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */ +#define BUF_LRU_OLD_RATIO_MIN 51 + +#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX +# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX" +#endif +#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV +# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV" +#endif + +/** Move blocks to "new" LRU list only if the first access was at +least this many milliseconds ago. Not protected by any mutex or latch. */ +extern uint buf_LRU_old_threshold_ms; +/* @} */ + +/** @brief Statistics for selecting the LRU list for eviction. + +These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O +and page_zip_decompress() operations. Based on the statistics we decide +if we want to evict from buf_pool->unzip_LRU or buf_pool->LRU. */ +struct buf_LRU_stat_t +{ + ulint io; /**< Counter of buffer pool I/O operations. */ + ulint unzip; /**< Counter of page_zip_decompress operations. */ +}; + +/** Current operation counters. Not protected by any mutex. +Cleared by buf_LRU_stat_update(). */ +extern buf_LRU_stat_t buf_LRU_stat_cur; + +/** Running sum of past values of buf_LRU_stat_cur. +Updated by buf_LRU_stat_update(). Protected by buf_pool->mutex. */ +extern buf_LRU_stat_t buf_LRU_stat_sum; + +/********************************************************************//** +Increments the I/O counter in buf_LRU_stat_cur. */ +#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++ +/********************************************************************//** +Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */ +#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++ + +#ifndef UNIV_NONINL +#include "buf0lru.ic" +#endif + +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/innobase/include/buf0lru.ic b/storage/innobase/include/buf0lru.ic new file mode 100644 index 00000000000..6e0da7a2588 --- /dev/null +++ b/storage/innobase/include/buf0lru.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0lru.ic +The database buffer replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h new file mode 100644 index 00000000000..d2a1f264ff5 --- /dev/null +++ b/storage/innobase/include/buf0rea.h @@ -0,0 +1,177 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0rea.h +The database buffer read + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0rea_h +#define buf0rea_h + +#include "univ.i" +#include "buf0types.h" + +/********************************************************************//** +High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@return TRUE if page has been read in, FALSE in case of failure */ +UNIV_INTERN +ibool +buf_read_page( +/*==========*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + ulint offset);/*!< in: page number */ +/********************************************************************//** +High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@return TRUE if page has been read in, FALSE in case of failure */ +UNIV_INTERN +ibool +buf_read_page_async( +/*================*/ + ulint space, /*!< in: space id */ + ulint offset);/*!< in: page number */ +/********************************************************************//** +Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! NOTE 2: the calling thread must want +access to the page given: this rule is set to prevent unintended read-aheads +performed by ibuf routines, a situation which could result in a deadlock if +the OS does not support asynchronous i/o. +@return number of page read requests issued; NOTE that if we read ibuf +pages, it may happen that the page at the given page number does not +get read even if we return a positive value! +@return number of page read requests issued */ +UNIV_INTERN +ulint +buf_read_ahead_random( +/*==================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes, + or 0 */ + ulint offset, /*!< in: page number of a page which + the current thread wants to access */ + ibool inside_ibuf); /*!< in: TRUE if we are inside ibuf + routine */ +/********************************************************************//** +Applies linear read-ahead if in the buf_pool the page is a border page of +a linear read-ahead area and all the pages in the area have been accessed. +Does not read any page if the read-ahead mechanism is not activated. Note +that the algorithm looks at the 'natural' adjacent successor and +predecessor of the page, which on the leaf level of a B-tree are the next +and previous page in the chain of leaves. To know these, the page specified +in (space, offset) must already be present in the buf_pool. Thus, the +natural way to use this function is to call it when a page in the buf_pool +is accessed the first time, calling this function just after it has been +bufferfixed. +NOTE 1: as this function looks at the natural predecessor and successor +fields on the page, what happens, if these are not initialized to any +sensible value? No problem, before applying read-ahead we check that the +area to read is within the span of the space, if not, read-ahead is not +applied. An uninitialized value may result in a useless read operation, but +only very improbably. +NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this +function must be written such that it cannot end up waiting for these +latches! +NOTE 3: the calling thread must want access to the page given: this rule is +set to prevent unintended read-aheads performed by ibuf routines, a situation +which could result in a deadlock if the OS does not support asynchronous io. +@return number of page read requests issued */ +UNIV_INTERN +ulint +buf_read_ahead_linear( +/*==================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes, or 0 */ + ulint offset, /*!< in: page number; see NOTE 3 above */ + ibool inside_ibuf); /*!< in: TRUE if we are inside ibuf routine */ +/********************************************************************//** +Issues read requests for pages which the ibuf module wants to read in, in +order to contract the insert buffer tree. Technically, this function is like +a read-ahead function. */ +UNIV_INTERN +void +buf_read_ibuf_merge_pages( +/*======================*/ + bool sync, /*!< in: true if the caller + wants this function to wait + for the highest address page + to get read in, before this + function returns */ + const ulint* space_ids, /*!< in: array of space ids */ + const ib_int64_t* space_versions,/*!< in: the spaces must have + this version number + (timestamp), otherwise we + discard the read; we use this + to cancel reads if DISCARD + + IMPORT may have changed the + tablespace size */ + const ulint* page_nos, /*!< in: array of page numbers + to read, with the highest page + number the last in the + array */ + ulint n_stored); /*!< in: number of elements + in the arrays */ +/********************************************************************//** +Issues read requests for pages which recovery wants to read in. */ +UNIV_INTERN +void +buf_read_recv_pages( +/*================*/ + ibool sync, /*!< in: TRUE if the caller + wants this function to wait + for the highest address page + to get read in, before this + function returns */ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in + bytes, or 0 */ + const ulint* page_nos, /*!< in: array of page numbers + to read, with the highest page + number the last in the + array */ + ulint n_stored); /*!< in: number of page numbers + in the array */ + +/** The size in pages of the area which the read-ahead algorithms read if +invoked */ +#define BUF_READ_AHEAD_AREA(b) \ + ut_min(64, ut_2_power_up((b)->curr_size / 32)) + +/** @name Modes used in read-ahead @{ */ +/** read only pages belonging to the insert buffer tree */ +#define BUF_READ_IBUF_PAGES_ONLY 131 +/** read any page */ +#define BUF_READ_ANY_PAGE 132 +/** read any page, but ignore (return an error) if a page does not exist +instead of crashing like BUF_READ_ANY_PAGE does */ +#define BUF_READ_IGNORE_NONEXISTENT_PAGES 1024 +/* @} */ + +#endif diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h new file mode 100644 index 00000000000..11bbc9b5c8a --- /dev/null +++ b/storage/innobase/include/buf0types.h @@ -0,0 +1,120 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0types.h +The database buffer pool global types for the directory + +Created 11/17/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0types_h +#define buf0types_h + +#if defined(INNODB_PAGE_ATOMIC_REF_COUNT) && defined(HAVE_ATOMIC_BUILTINS) +#define PAGE_ATOMIC_REF_COUNT +#endif /* INNODB_PAGE_ATOMIC_REF_COUNT && HAVE_ATOMIC_BUILTINS */ + +/** Buffer page (uncompressed or compressed) */ +struct buf_page_t; +/** Buffer block for which an uncompressed page exists */ +struct buf_block_t; +/** Buffer pool chunk comprising buf_block_t */ +struct buf_chunk_t; +/** Buffer pool comprising buf_chunk_t */ +struct buf_pool_t; +/** Buffer pool statistics struct */ +struct buf_pool_stat_t; +/** Buffer pool buddy statistics struct */ +struct buf_buddy_stat_t; +/** Doublewrite memory struct */ +struct buf_dblwr_t; + +/** A buffer frame. @see page_t */ +typedef byte buf_frame_t; + +/** Flags for flush types */ +enum buf_flush_t { + BUF_FLUSH_LRU = 0, /*!< flush via the LRU list */ + BUF_FLUSH_LIST, /*!< flush via the flush list + of dirty blocks */ + BUF_FLUSH_SINGLE_PAGE, /*!< flush via the LRU list + but only a single page */ + BUF_FLUSH_N_TYPES /*!< index of last element + 1 */ +}; + +/** Algorithm to remove the pages for a tablespace from the buffer pool. +See buf_LRU_flush_or_remove_pages(). */ +enum buf_remove_t { + BUF_REMOVE_ALL_NO_WRITE, /*!< Remove all pages from the buffer + pool, don't write or sync to disk */ + BUF_REMOVE_FLUSH_NO_WRITE, /*!< Remove only, from the flush list, + don't write or sync to disk */ + BUF_REMOVE_FLUSH_WRITE /*!< Flush dirty pages to disk only + don't remove from the buffer pool */ +}; + +/** Flags for io_fix types */ +enum buf_io_fix { + BUF_IO_NONE = 0, /**< no pending I/O */ + BUF_IO_READ, /**< read pending */ + BUF_IO_WRITE, /**< write pending */ + BUF_IO_PIN /**< disallow relocation of + block and its removal of from + the flush_list */ +}; + +/** Alternatives for srv_checksum_algorithm, which can be changed by +setting innodb_checksum_algorithm */ +enum srv_checksum_algorithm_t { + SRV_CHECKSUM_ALGORITHM_CRC32, /*!< Write crc32, allow crc32, + innodb or none when reading */ + SRV_CHECKSUM_ALGORITHM_STRICT_CRC32, /*!< Write crc32, allow crc32 + when reading */ + SRV_CHECKSUM_ALGORITHM_INNODB, /*!< Write innodb, allow crc32, + innodb or none when reading */ + SRV_CHECKSUM_ALGORITHM_STRICT_INNODB, /*!< Write innodb, allow + innodb when reading */ + SRV_CHECKSUM_ALGORITHM_NONE, /*!< Write none, allow crc32, + innodb or none when reading */ + SRV_CHECKSUM_ALGORITHM_STRICT_NONE /*!< Write none, allow none + when reading */ +}; + +/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */ +/* @{ */ +/** Zip shift value for the smallest page size */ +#define BUF_BUDDY_LOW_SHIFT UNIV_ZIP_SIZE_SHIFT_MIN + +/** Smallest buddy page size */ +#define BUF_BUDDY_LOW (1U << BUF_BUDDY_LOW_SHIFT) + +/** Actual number of buddy sizes based on current page size */ +#define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT) + +/** Maximum number of buddy sizes based on the max page size */ +#define BUF_BUDDY_SIZES_MAX (UNIV_PAGE_SIZE_SHIFT_MAX \ + - BUF_BUDDY_LOW_SHIFT) + +/** twice the maximum block size of the buddy system; +the underlying memory is aligned by this amount: +this must be equal to UNIV_PAGE_SIZE */ +#define BUF_BUDDY_HIGH (BUF_BUDDY_LOW << BUF_BUDDY_SIZES) +/* @} */ + +#endif /* buf0types.h */ diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h new file mode 100644 index 00000000000..a548c7b89b3 --- /dev/null +++ b/storage/innobase/include/data0data.h @@ -0,0 +1,536 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0data.h +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef data0data_h +#define data0data_h + +#include "univ.i" + +#include "data0types.h" +#include "data0type.h" +#include "mem0mem.h" +#include "dict0types.h" + +/** Storage for overflow data in a big record, that is, a clustered +index record which needs external storage of data fields */ +struct big_rec_t; + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets pointer to the type struct of SQL data field. +@return pointer to the type struct */ +UNIV_INLINE +dtype_t* +dfield_get_type( +/*============*/ + const dfield_t* field) /*!< in: SQL data field */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets pointer to the data in a field. +@return pointer to data */ +UNIV_INLINE +void* +dfield_get_data( +/*============*/ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define dfield_get_type(field) (&(field)->type) +# define dfield_get_data(field) ((field)->data) +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Sets the type struct of SQL data field. */ +UNIV_INLINE +void +dfield_set_type( +/*============*/ + dfield_t* field, /*!< in: SQL data field */ + const dtype_t* type) /*!< in: pointer to data type struct */ + __attribute__((nonnull)); +/*********************************************************************//** +Gets length of field data. +@return length of data; UNIV_SQL_NULL if SQL null data */ +UNIV_INLINE +ulint +dfield_get_len( +/*===========*/ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets length in a field. */ +UNIV_INLINE +void +dfield_set_len( +/*===========*/ + dfield_t* field, /*!< in: field */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ + __attribute__((nonnull)); +/*********************************************************************//** +Determines if a field is SQL NULL +@return nonzero if SQL null data */ +UNIV_INLINE +ulint +dfield_is_null( +/*===========*/ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Determines if a field is externally stored +@return nonzero if externally stored */ +UNIV_INLINE +ulint +dfield_is_ext( +/*==========*/ + const dfield_t* field) /*!< in: field */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets the "external storage" flag */ +UNIV_INLINE +void +dfield_set_ext( +/*===========*/ + dfield_t* field) /*!< in/out: field */ + __attribute__((nonnull)); +/*********************************************************************//** +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +dfield_set_data( +/*============*/ + dfield_t* field, /*!< in: field */ + const void* data, /*!< in: data */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ + __attribute__((nonnull(1))); +/*********************************************************************//** +Sets a data field to SQL NULL. */ +UNIV_INLINE +void +dfield_set_null( +/*============*/ + dfield_t* field) /*!< in/out: field */ + __attribute__((nonnull)); +/**********************************************************************//** +Writes an SQL null field full of zeros. */ +UNIV_INLINE +void +data_write_sql_null( +/*================*/ + byte* data, /*!< in: pointer to a buffer of size len */ + ulint len) /*!< in: SQL null size in bytes */ + __attribute__((nonnull)); +/*********************************************************************//** +Copies the data and len fields. */ +UNIV_INLINE +void +dfield_copy_data( +/*=============*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ + __attribute__((nonnull)); +/*********************************************************************//** +Copies a data field to another. */ +UNIV_INLINE +void +dfield_copy( +/*========*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ + __attribute__((nonnull)); +/*********************************************************************//** +Copies the data pointed to by a data field. */ +UNIV_INLINE +void +dfield_dup( +/*=======*/ + dfield_t* field, /*!< in/out: data field */ + mem_heap_t* heap) /*!< in: memory heap where allocated */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Tests if two data fields are equal. +If len==0, tests the data length and content for equality. +If len>0, tests the first len bytes of the content for equality. +@return TRUE if both fields are NULL or if they are equal */ +UNIV_INLINE +ibool +dfield_datas_are_binary_equal( +/*==========================*/ + const dfield_t* field1, /*!< in: field */ + const dfield_t* field2, /*!< in: field */ + ulint len) /*!< in: maximum prefix to compare, + or 0 to compare the whole field length */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Tests if dfield data length and content is equal to the given. +@return TRUE if equal */ +UNIV_INLINE +ibool +dfield_data_is_binary_equal( +/*========================*/ + const dfield_t* field, /*!< in: field */ + ulint len, /*!< in: data length or UNIV_SQL_NULL */ + const byte* data) /*!< in: data */ + __attribute__((nonnull, warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Gets number of fields in a data tuple. +@return number of fields */ +UNIV_INLINE +ulint +dtuple_get_n_fields( +/*================*/ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets nth field of a tuple. +@return nth field */ +UNIV_INLINE +dfield_t* +dtuple_get_nth_field( +/*=================*/ + const dtuple_t* tuple, /*!< in: tuple */ + ulint n); /*!< in: index of field */ +#else /* UNIV_DEBUG */ +# define dtuple_get_nth_field(tuple, n) ((tuple)->fields + (n)) +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Gets info bits in a data tuple. +@return info bits */ +UNIV_INLINE +ulint +dtuple_get_info_bits( +/*=================*/ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets info bits in a data tuple. */ +UNIV_INLINE +void +dtuple_set_info_bits( +/*=================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint info_bits) /*!< in: info bits */ + __attribute__((nonnull)); +/*********************************************************************//** +Gets number of fields used in record comparisons. +@return number of fields used in comparisons in rem0cmp.* */ +UNIV_INLINE +ulint +dtuple_get_n_fields_cmp( +/*====================*/ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets number of fields used in record comparisons. */ +UNIV_INLINE +void +dtuple_set_n_fields_cmp( +/*====================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields_cmp) /*!< in: number of fields used in + comparisons in rem0cmp.* */ + __attribute__((nonnull)); + +/* Estimate the number of bytes that are going to be allocated when +creating a new dtuple_t object */ +#define DTUPLE_EST_ALLOC(n_fields) \ + (sizeof(dtuple_t) + (n_fields) * sizeof(dfield_t)) + +/**********************************************************//** +Creates a data tuple from an already allocated chunk of memory. +The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields). +The default value for number of fields used in record comparisons +for this tuple is n_fields. +@return created tuple (inside buf) */ +UNIV_INLINE +dtuple_t* +dtuple_create_from_mem( +/*===================*/ + void* buf, /*!< in, out: buffer to use */ + ulint buf_size, /*!< in: buffer size */ + ulint n_fields) /*!< in: number of fields */ + __attribute__((nonnull, warn_unused_result)); + +/**********************************************************//** +Creates a data tuple to a memory heap. The default value for number +of fields used in record comparisons for this tuple is n_fields. +@return own: created tuple */ +UNIV_INLINE +dtuple_t* +dtuple_create( +/*==========*/ + mem_heap_t* heap, /*!< in: memory heap where the tuple + is created, DTUPLE_EST_ALLOC(n_fields) + bytes will be allocated from this heap */ + ulint n_fields)/*!< in: number of fields */ + __attribute__((nonnull, malloc)); + +/*********************************************************************//** +Sets number of fields used in a tuple. Normally this is set in +dtuple_create, but if you want later to set it smaller, you can use this. */ +UNIV_INTERN +void +dtuple_set_n_fields( +/*================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields) /*!< in: number of fields */ + __attribute__((nonnull)); +/*********************************************************************//** +Copies a data tuple to another. This is a shallow copy; if a deep copy +is desired, dfield_dup() will have to be invoked on each field. +@return own: copy of tuple */ +UNIV_INLINE +dtuple_t* +dtuple_copy( +/*========*/ + const dtuple_t* tuple, /*!< in: tuple to copy from */ + mem_heap_t* heap) /*!< in: memory heap + where the tuple is created */ + __attribute__((nonnull, malloc)); +/**********************************************************//** +The following function returns the sum of data lengths of a tuple. The space +occupied by the field structs or the tuple struct is not counted. +@return sum of data lens */ +UNIV_INLINE +ulint +dtuple_get_data_size( +/*=================*/ + const dtuple_t* tuple, /*!< in: typed data tuple */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + __attribute__((nonnull)); +/*********************************************************************//** +Computes the number of externally stored fields in a data tuple. +@return number of fields */ +UNIV_INLINE +ulint +dtuple_get_n_ext( +/*=============*/ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull)); +/************************************************************//** +Compare two data tuples, respecting the collation of character fields. +@return 1, 0 , -1 if tuple1 is greater, equal, less, respectively, +than tuple2 */ +UNIV_INTERN +int +dtuple_coll_cmp( +/*============*/ + const dtuple_t* tuple1, /*!< in: tuple 1 */ + const dtuple_t* tuple2) /*!< in: tuple 2 */ + __attribute__((nonnull, warn_unused_result)); +/************************************************************//** +Folds a prefix given as the number of fields of a tuple. +@return the folded value */ +UNIV_INLINE +ulint +dtuple_fold( +/*========*/ + const dtuple_t* tuple, /*!< in: the tuple */ + ulint n_fields,/*!< in: number of complete fields to fold */ + ulint n_bytes,/*!< in: number of bytes to fold in an + incomplete last field */ + index_id_t tree_id)/*!< in: index tree id */ + __attribute__((nonnull, pure, warn_unused_result)); +/*******************************************************************//** +Sets types of fields binary in a tuple. */ +UNIV_INLINE +void +dtuple_set_types_binary( +/*====================*/ + dtuple_t* tuple, /*!< in: data tuple */ + ulint n) /*!< in: number of fields to set */ + __attribute__((nonnull)); +/**********************************************************************//** +Checks if a dtuple contains an SQL null value. +@return TRUE if some field is SQL null */ +UNIV_INLINE +ibool +dtuple_contains_null( +/*=================*/ + const dtuple_t* tuple) /*!< in: dtuple */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************//** +Checks that a data field is typed. Asserts an error if not. +@return TRUE if ok */ +UNIV_INTERN +ibool +dfield_check_typed( +/*===============*/ + const dfield_t* field) /*!< in: data field */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************//** +Checks that a data tuple is typed. Asserts an error if not. +@return TRUE if ok */ +UNIV_INTERN +ibool +dtuple_check_typed( +/*===============*/ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************//** +Checks that a data tuple is typed. +@return TRUE if ok */ +UNIV_INTERN +ibool +dtuple_check_typed_no_assert( +/*=========================*/ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +/**********************************************************//** +Validates the consistency of a tuple which must be complete, i.e, +all fields must have been set. +@return TRUE if ok */ +UNIV_INTERN +ibool +dtuple_validate( +/*============*/ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ +/*************************************************************//** +Pretty prints a dfield value according to its data type. */ +UNIV_INTERN +void +dfield_print( +/*=========*/ + const dfield_t* dfield) /*!< in: dfield */ + __attribute__((nonnull)); +/*************************************************************//** +Pretty prints a dfield value according to its data type. Also the hex string +is printed if a string contains non-printable characters. */ +UNIV_INTERN +void +dfield_print_also_hex( +/*==================*/ + const dfield_t* dfield) /*!< in: dfield */ + __attribute__((nonnull)); +/**********************************************************//** +The following function prints the contents of a tuple. */ +UNIV_INTERN +void +dtuple_print( +/*=========*/ + FILE* f, /*!< in: output stream */ + const dtuple_t* tuple) /*!< in: tuple */ + __attribute__((nonnull)); +/**************************************************************//** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. +@return own: created big record vector, NULL if we are not able to +shorten the entry enough, i.e., if there are too many fixed-length or +short fields in entry or the index is clustered */ +UNIV_INTERN +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in/out: index entry */ + ulint* n_ext) /*!< in/out: number of + externally stored columns */ + __attribute__((nonnull, malloc, warn_unused_result)); +/**************************************************************//** +Puts back to entry the data stored in vector. Note that to ensure the +fields in entry can accommodate the data, vector must have been created +from entry with dtuple_convert_big_rec. */ +UNIV_INTERN +void +dtuple_convert_back_big_rec( +/*========================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: entry whose data was put to vector */ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ + __attribute__((nonnull)); +/**************************************************************//** +Frees the memory in a big rec vector. */ +UNIV_INLINE +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ + __attribute__((nonnull)); + +/*######################################################################*/ + +/** Structure for an SQL data field */ +struct dfield_t{ + void* data; /*!< pointer to data */ + unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */ + unsigned len:32; /*!< data length; UNIV_SQL_NULL if SQL null */ + dtype_t type; /*!< type of data */ +}; + +/** Structure for an SQL data tuple of fields (logical record) */ +struct dtuple_t { + ulint info_bits; /*!< info bits of an index record: + the default is 0; this field is used + if an index record is built from + a data tuple */ + ulint n_fields; /*!< number of fields in dtuple */ + ulint n_fields_cmp; /*!< number of fields which should + be used in comparison services + of rem0cmp.*; the index search + is performed by comparing only these + fields, others are ignored; the + default value in dtuple creation is + the same value as n_fields */ + dfield_t* fields; /*!< fields */ + UT_LIST_NODE_T(dtuple_t) tuple_list; + /*!< data tuples can be linked into a + list using this field */ +#ifdef UNIV_DEBUG + ulint magic_n; /*!< magic number, used in + debug assertions */ +/** Value of dtuple_t::magic_n */ +# define DATA_TUPLE_MAGIC_N 65478679 +#endif /* UNIV_DEBUG */ +}; + +/** A slot for a field in a big rec vector */ +struct big_rec_field_t { + ulint field_no; /*!< field number in record */ + ulint len; /*!< stored data length, in bytes */ + const void* data; /*!< stored data */ +}; + +/** Storage format for overflow data in a big record, that is, a +clustered index record which needs external storage of data fields */ +struct big_rec_t { + mem_heap_t* heap; /*!< memory heap from which + allocated */ + ulint n_fields; /*!< number of stored fields */ + big_rec_field_t*fields; /*!< stored fields */ +}; + +#ifndef UNIV_NONINL +#include "data0data.ic" +#endif + +#endif diff --git a/storage/innobase/include/data0data.ic b/storage/innobase/include/data0data.ic new file mode 100644 index 00000000000..6937d55d211 --- /dev/null +++ b/storage/innobase/include/data0data.ic @@ -0,0 +1,649 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0data.ic +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "mem0mem.h" +#include "ut0rnd.h" + +#ifdef UNIV_DEBUG +/** Dummy variable to catch access to uninitialized fields. In the +debug version, dtuple_create() will make all fields of dtuple_t point +to data_error. */ +extern byte data_error; + +/*********************************************************************//** +Gets pointer to the type struct of SQL data field. +@return pointer to the type struct */ +UNIV_INLINE +dtype_t* +dfield_get_type( +/*============*/ + const dfield_t* field) /*!< in: SQL data field */ +{ + ut_ad(field); + + return((dtype_t*) &(field->type)); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Sets the type struct of SQL data field. */ +UNIV_INLINE +void +dfield_set_type( +/*============*/ + dfield_t* field, /*!< in: SQL data field */ + const dtype_t* type) /*!< in: pointer to data type struct */ +{ + ut_ad(field && type); + + field->type = *type; +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets pointer to the data in a field. +@return pointer to data */ +UNIV_INLINE +void* +dfield_get_data( +/*============*/ + const dfield_t* field) /*!< in: field */ +{ + ut_ad(field); + ut_ad((field->len == UNIV_SQL_NULL) + || (field->data != &data_error)); + + return((void*) field->data); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Gets length of field data. +@return length of data; UNIV_SQL_NULL if SQL null data */ +UNIV_INLINE +ulint +dfield_get_len( +/*===========*/ + const dfield_t* field) /*!< in: field */ +{ + ut_ad(field); + ut_ad((field->len == UNIV_SQL_NULL) + || (field->data != &data_error)); + + return(field->len); +} + +/*********************************************************************//** +Sets length in a field. */ +UNIV_INLINE +void +dfield_set_len( +/*===========*/ + dfield_t* field, /*!< in: field */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ +{ + ut_ad(field); +#ifdef UNIV_VALGRIND_DEBUG + if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(field->data, len); +#endif /* UNIV_VALGRIND_DEBUG */ + + field->ext = 0; + field->len = len; +} + +/*********************************************************************//** +Determines if a field is SQL NULL +@return nonzero if SQL null data */ +UNIV_INLINE +ulint +dfield_is_null( +/*===========*/ + const dfield_t* field) /*!< in: field */ +{ + ut_ad(field); + + return(field->len == UNIV_SQL_NULL); +} + +/*********************************************************************//** +Determines if a field is externally stored +@return nonzero if externally stored */ +UNIV_INLINE +ulint +dfield_is_ext( +/*==========*/ + const dfield_t* field) /*!< in: field */ +{ + ut_ad(field); + + return(field->ext); +} + +/*********************************************************************//** +Sets the "external storage" flag */ +UNIV_INLINE +void +dfield_set_ext( +/*===========*/ + dfield_t* field) /*!< in/out: field */ +{ + ut_ad(field); + + field->ext = 1; +} + +/*********************************************************************//** +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +dfield_set_data( +/*============*/ + dfield_t* field, /*!< in: field */ + const void* data, /*!< in: data */ + ulint len) /*!< in: length or UNIV_SQL_NULL */ +{ + ut_ad(field); + +#ifdef UNIV_VALGRIND_DEBUG + if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(data, len); +#endif /* UNIV_VALGRIND_DEBUG */ + field->data = (void*) data; + field->ext = 0; + field->len = len; +} + +/*********************************************************************//** +Sets a data field to SQL NULL. */ +UNIV_INLINE +void +dfield_set_null( +/*============*/ + dfield_t* field) /*!< in/out: field */ +{ + dfield_set_data(field, NULL, UNIV_SQL_NULL); +} + +/*********************************************************************//** +Copies the data and len fields. */ +UNIV_INLINE +void +dfield_copy_data( +/*=============*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ +{ + ut_ad(field1 && field2); + + field1->data = field2->data; + field1->len = field2->len; + field1->ext = field2->ext; +} + +/*********************************************************************//** +Copies a data field to another. */ +UNIV_INLINE +void +dfield_copy( +/*========*/ + dfield_t* field1, /*!< out: field to copy to */ + const dfield_t* field2) /*!< in: field to copy from */ +{ + *field1 = *field2; +} + +/*********************************************************************//** +Copies the data pointed to by a data field. */ +UNIV_INLINE +void +dfield_dup( +/*=======*/ + dfield_t* field, /*!< in/out: data field */ + mem_heap_t* heap) /*!< in: memory heap where allocated */ +{ + if (!dfield_is_null(field)) { + UNIV_MEM_ASSERT_RW(field->data, field->len); + field->data = mem_heap_dup(heap, field->data, field->len); + } +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Tests if two data fields are equal. +If len==0, tests the data length and content for equality. +If len>0, tests the first len bytes of the content for equality. +@return TRUE if both fields are NULL or if they are equal */ +UNIV_INLINE +ibool +dfield_datas_are_binary_equal( +/*==========================*/ + const dfield_t* field1, /*!< in: field */ + const dfield_t* field2, /*!< in: field */ + ulint len) /*!< in: maximum prefix to compare, + or 0 to compare the whole field length */ +{ + ulint len2 = len; + + if (field1->len == UNIV_SQL_NULL || len == 0 || field1->len < len) { + len = field1->len; + } + + if (field2->len == UNIV_SQL_NULL || len2 == 0 || field2->len < len2) { + len2 = field2->len; + } + + return(len == len2 + && (len == UNIV_SQL_NULL + || !memcmp(field1->data, field2->data, len))); +} + +/*********************************************************************//** +Tests if dfield data length and content is equal to the given. +@return TRUE if equal */ +UNIV_INLINE +ibool +dfield_data_is_binary_equal( +/*========================*/ + const dfield_t* field, /*!< in: field */ + ulint len, /*!< in: data length or UNIV_SQL_NULL */ + const byte* data) /*!< in: data */ +{ + return(len == dfield_get_len(field) + && (len == UNIV_SQL_NULL + || !memcmp(dfield_get_data(field), data, len))); +} +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Gets info bits in a data tuple. +@return info bits */ +UNIV_INLINE +ulint +dtuple_get_info_bits( +/*=================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ut_ad(tuple); + + return(tuple->info_bits); +} + +/*********************************************************************//** +Sets info bits in a data tuple. */ +UNIV_INLINE +void +dtuple_set_info_bits( +/*=================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint info_bits) /*!< in: info bits */ +{ + ut_ad(tuple); + + tuple->info_bits = info_bits; +} + +/*********************************************************************//** +Gets number of fields used in record comparisons. +@return number of fields used in comparisons in rem0cmp.* */ +UNIV_INLINE +ulint +dtuple_get_n_fields_cmp( +/*====================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ut_ad(tuple); + + return(tuple->n_fields_cmp); +} + +/*********************************************************************//** +Sets number of fields used in record comparisons. */ +UNIV_INLINE +void +dtuple_set_n_fields_cmp( +/*====================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields_cmp) /*!< in: number of fields used in + comparisons in rem0cmp.* */ +{ + ut_ad(tuple); + ut_ad(n_fields_cmp <= tuple->n_fields); + + tuple->n_fields_cmp = n_fields_cmp; +} + +/*********************************************************************//** +Gets number of fields in a data tuple. +@return number of fields */ +UNIV_INLINE +ulint +dtuple_get_n_fields( +/*================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ut_ad(tuple); + + return(tuple->n_fields); +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Gets nth field of a tuple. +@return nth field */ +UNIV_INLINE +dfield_t* +dtuple_get_nth_field( +/*=================*/ + const dtuple_t* tuple, /*!< in: tuple */ + ulint n) /*!< in: index of field */ +{ + ut_ad(tuple); + ut_ad(n < tuple->n_fields); + + return((dfield_t*) tuple->fields + n); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************//** +Creates a data tuple from an already allocated chunk of memory. +The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields). +The default value for number of fields used in record comparisons +for this tuple is n_fields. +@return created tuple (inside buf) */ +UNIV_INLINE +dtuple_t* +dtuple_create_from_mem( +/*===================*/ + void* buf, /*!< in, out: buffer to use */ + ulint buf_size, /*!< in: buffer size */ + ulint n_fields) /*!< in: number of fields */ +{ + dtuple_t* tuple; + + ut_ad(buf != NULL); + ut_a(buf_size >= DTUPLE_EST_ALLOC(n_fields)); + + tuple = (dtuple_t*) buf; + tuple->info_bits = 0; + tuple->n_fields = n_fields; + tuple->n_fields_cmp = n_fields; + tuple->fields = (dfield_t*) &tuple[1]; + +#ifdef UNIV_DEBUG + tuple->magic_n = DATA_TUPLE_MAGIC_N; + + { /* In the debug version, initialize fields to an error value */ + ulint i; + + for (i = 0; i < n_fields; i++) { + dfield_t* field; + + field = dtuple_get_nth_field(tuple, i); + + dfield_set_len(field, UNIV_SQL_NULL); + field->data = &data_error; + dfield_get_type(field)->mtype = DATA_ERROR; + } + } +#endif + UNIV_MEM_ASSERT_W(tuple->fields, n_fields * sizeof *tuple->fields); + UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields); + return(tuple); +} + +/**********************************************************//** +Creates a data tuple to a memory heap. The default value for number +of fields used in record comparisons for this tuple is n_fields. +@return own: created tuple */ +UNIV_INLINE +dtuple_t* +dtuple_create( +/*==========*/ + mem_heap_t* heap, /*!< in: memory heap where the tuple + is created, DTUPLE_EST_ALLOC(n_fields) + bytes will be allocated from this heap */ + ulint n_fields) /*!< in: number of fields */ +{ + void* buf; + ulint buf_size; + dtuple_t* tuple; + + ut_ad(heap); + + buf_size = DTUPLE_EST_ALLOC(n_fields); + buf = mem_heap_alloc(heap, buf_size); + + tuple = dtuple_create_from_mem(buf, buf_size, n_fields); + + return(tuple); +} + +/*********************************************************************//** +Copies a data tuple to another. This is a shallow copy; if a deep copy +is desired, dfield_dup() will have to be invoked on each field. +@return own: copy of tuple */ +UNIV_INLINE +dtuple_t* +dtuple_copy( +/*========*/ + const dtuple_t* tuple, /*!< in: tuple to copy from */ + mem_heap_t* heap) /*!< in: memory heap + where the tuple is created */ +{ + ulint n_fields = dtuple_get_n_fields(tuple); + dtuple_t* new_tuple = dtuple_create(heap, n_fields); + ulint i; + + for (i = 0; i < n_fields; i++) { + dfield_copy(dtuple_get_nth_field(new_tuple, i), + dtuple_get_nth_field(tuple, i)); + } + + return(new_tuple); +} + +/**********************************************************//** +The following function returns the sum of data lengths of a tuple. The space +occupied by the field structs or the tuple struct is not counted. Neither +is possible space in externally stored parts of the field. +@return sum of data lengths */ +UNIV_INLINE +ulint +dtuple_get_data_size( +/*=================*/ + const dtuple_t* tuple, /*!< in: typed data tuple */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + const dfield_t* field; + ulint n_fields; + ulint len; + ulint i; + ulint sum = 0; + + ut_ad(tuple); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + n_fields = tuple->n_fields; + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + len = dfield_get_len(field); + + if (len == UNIV_SQL_NULL) { + len = dtype_get_sql_null_size(dfield_get_type(field), + comp); + } + + sum += len; + } + + return(sum); +} + +/*********************************************************************//** +Computes the number of externally stored fields in a data tuple. +@return number of externally stored fields */ +UNIV_INLINE +ulint +dtuple_get_n_ext( +/*=============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ulint n_ext = 0; + ulint n_fields = tuple->n_fields; + ulint i; + + ut_ad(tuple); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + for (i = 0; i < n_fields; i++) { + n_ext += dtuple_get_nth_field(tuple, i)->ext; + } + + return(n_ext); +} + +/*******************************************************************//** +Sets types of fields binary in a tuple. */ +UNIV_INLINE +void +dtuple_set_types_binary( +/*====================*/ + dtuple_t* tuple, /*!< in: data tuple */ + ulint n) /*!< in: number of fields to set */ +{ + dtype_t* dfield_type; + ulint i; + + for (i = 0; i < n; i++) { + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dtype_set(dfield_type, DATA_BINARY, 0, 0); + } +} + +/************************************************************//** +Folds a prefix given as the number of fields of a tuple. +@return the folded value */ +UNIV_INLINE +ulint +dtuple_fold( +/*========*/ + const dtuple_t* tuple, /*!< in: the tuple */ + ulint n_fields,/*!< in: number of complete fields to fold */ + ulint n_bytes,/*!< in: number of bytes to fold in an + incomplete last field */ + index_id_t tree_id)/*!< in: index tree id */ +{ + const dfield_t* field; + ulint i; + const byte* data; + ulint len; + ulint fold; + + ut_ad(tuple); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(dtuple_check_typed(tuple)); + + fold = ut_fold_ull(tree_id); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = (const byte*) dfield_get_data(field); + len = dfield_get_len(field); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + field = dtuple_get_nth_field(tuple, i); + + data = (const byte*) dfield_get_data(field); + len = dfield_get_len(field); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} + +/**********************************************************************//** +Writes an SQL null field full of zeros. */ +UNIV_INLINE +void +data_write_sql_null( +/*================*/ + byte* data, /*!< in: pointer to a buffer of size len */ + ulint len) /*!< in: SQL null size in bytes */ +{ + memset(data, 0, len); +} + +/**********************************************************************//** +Checks if a dtuple contains an SQL null value. +@return TRUE if some field is SQL null */ +UNIV_INLINE +ibool +dtuple_contains_null( +/*=================*/ + const dtuple_t* tuple) /*!< in: dtuple */ +{ + ulint n; + ulint i; + + n = dtuple_get_n_fields(tuple); + + for (i = 0; i < n; i++) { + if (dfield_is_null(dtuple_get_nth_field(tuple, i))) { + + return(TRUE); + } + } + + return(FALSE); +} + +/**************************************************************//** +Frees the memory in a big rec vector. */ +UNIV_INLINE +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ +{ + mem_heap_free(vector->heap); +} diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h new file mode 100644 index 00000000000..111664b0b52 --- /dev/null +++ b/storage/innobase/include/data0type.h @@ -0,0 +1,544 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/data0type.h +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#ifndef data0type_h +#define data0type_h + +#include "univ.i" + +extern ulint data_mysql_default_charset_coll; +#define DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL 8 +#define DATA_MYSQL_BINARY_CHARSET_COLL 63 + +/* SQL data type struct */ +struct dtype_t; + +/* SQL Like operator comparison types */ +enum ib_like_t { + IB_LIKE_EXACT, /* e.g. STRING */ + IB_LIKE_PREFIX, /* e.g., STRING% */ + IB_LIKE_SUFFIX, /* e.g., %STRING */ + IB_LIKE_SUBSTR, /* e.g., %STRING% */ + IB_LIKE_REGEXP /* Future */ +}; + +/*-------------------------------------------*/ +/* The 'MAIN TYPE' of a column */ +#define DATA_MISSING 0 /* missing column */ +#define DATA_VARCHAR 1 /* character varying of the + latin1_swedish_ci charset-collation; note + that the MySQL format for this, DATA_BINARY, + DATA_VARMYSQL, is also affected by whether the + 'precise type' contains + DATA_MYSQL_TRUE_VARCHAR */ +#define DATA_CHAR 2 /* fixed length character of the + latin1_swedish_ci charset-collation */ +#define DATA_FIXBINARY 3 /* binary string of fixed length */ +#define DATA_BINARY 4 /* binary string */ +#define DATA_BLOB 5 /* binary large object, or a TEXT type; + if prtype & DATA_BINARY_TYPE == 0, then this is + actually a TEXT column (or a BLOB created + with < 4.0.14; since column prefix indexes + came only in 4.0.14, the missing flag in BLOBs + created before that does not cause any harm) */ +#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */ +#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */ +#define DATA_SYS 8 /* system column */ + +/* Data types >= DATA_FLOAT must be compared using the whole field, not as +binary strings */ + +#define DATA_FLOAT 9 +#define DATA_DOUBLE 10 +#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */ +#define DATA_VARMYSQL 12 /* any charset varying length char */ +#define DATA_MYSQL 13 /* any charset fixed length char */ + /* NOTE that 4.1.1 used DATA_MYSQL and + DATA_VARMYSQL for all character sets, and the + charset-collation for tables created with it + can also be latin1_swedish_ci */ +#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size() + requires the values are <= 63 */ +/*-------------------------------------------*/ +/* The 'PRECISE TYPE' of a column */ +/* +Tables created by a MySQL user have the following convention: + +- In the least significant byte in the precise type we store the MySQL type +code (not applicable for system columns). + +- In the second least significant byte we OR flags DATA_NOT_NULL, +DATA_UNSIGNED, DATA_BINARY_TYPE. + +- In the third least significant byte of the precise type of string types we +store the MySQL charset-collation code. In DATA_BLOB columns created with +< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there +are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no +problem, though. + +Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the +precise type, since the charset was always the default charset of the MySQL +installation. If the stored charset code is 0 in the system table SYS_COLUMNS +of InnoDB, that means that the default charset of this MySQL installation +should be used. + +When loading a table definition from the system tables to the InnoDB data +dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check +if the stored charset-collation is 0, and if that is the case and the type is +a non-binary string, replace that 0 by the default charset-collation code of +this MySQL installation. In short, in old tables, the charset-collation code +in the system tables on disk can be 0, but in in-memory data structures +(dtype_t), the charset-collation code is always != 0 for non-binary string +types. + +In new tables, in binary string types, the charset-collation code is the +MySQL code for the 'binary charset', that is, != 0. + +For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those +DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci, +InnoDB performs all comparisons internally, without resorting to the MySQL +comparison functions. This is to save CPU time. + +InnoDB's own internal system tables have different precise types for their +columns, and for them the precise type is usually not used at all. +*/ + +#define DATA_ENGLISH 4 /* English language character string: this + is a relic from pre-MySQL time and only used + for InnoDB's own system tables */ +#define DATA_ERROR 111 /* another relic from pre-MySQL time */ + +#define DATA_MYSQL_TYPE_MASK 255 /* AND with this mask to extract the MySQL + type from the precise type */ +#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3 + format true VARCHAR */ + +/* Precise data types for system columns and the length of those columns; +NOTE: the values must run from 0 up in the order given! All codes must +be less than 256 */ +#define DATA_ROW_ID 0 /* row id: a 48-bit integer */ +#define DATA_ROW_ID_LEN 6 /* stored length for row id */ + +#define DATA_TRX_ID 1 /* transaction id: 6 bytes */ +#define DATA_TRX_ID_LEN 6 + +#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */ +#define DATA_ROLL_PTR_LEN 7 + +#define DATA_N_SYS_COLS 3 /* number of system columns defined above */ + +#define DATA_FTS_DOC_ID 3 /* Used as FTS DOC ID column */ + +#define DATA_SYS_PRTYPE_MASK 0xF /* mask to extract the above from prtype */ + +/* Flags ORed to the precise data type */ +#define DATA_NOT_NULL 256 /* this is ORed to the precise type when + the column is declared as NOT NULL */ +#define DATA_UNSIGNED 512 /* this id ORed to the precise type when + we have an unsigned integer type */ +#define DATA_BINARY_TYPE 1024 /* if the data type is a binary character + string, this is ORed to the precise type: + this only holds for tables created with + >= MySQL-4.0.14 */ +/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1. + In earlier versions this was set for some + BLOB columns. +*/ +#define DATA_LONG_TRUE_VARCHAR 4096 /* this is ORed to the precise data + type when the column is true VARCHAR where + MySQL uses 2 bytes to store the data len; + for shorter VARCHARs MySQL uses only 1 byte */ +/*-------------------------------------------*/ + +/* This many bytes we need to store the type information affecting the +alphabetical order for a single field and decide the storage size of an +SQL null*/ +#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4 +/* In the >= 4.1.x storage format we add 2 bytes more so that we can also +store the charset-collation number; one byte is left unused, though */ +#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6 + +/* Maximum multi-byte character length in bytes, plus 1 */ +#define DATA_MBMAX 5 + +/* Pack mbminlen, mbmaxlen to mbminmaxlen. */ +#define DATA_MBMINMAXLEN(mbminlen, mbmaxlen) \ + ((mbmaxlen) * DATA_MBMAX + (mbminlen)) +/* Get mbminlen from mbminmaxlen. Cast the result of UNIV_EXPECT to ulint +because in GCC it returns a long. */ +#define DATA_MBMINLEN(mbminmaxlen) ((ulint) \ + UNIV_EXPECT(((mbminmaxlen) % DATA_MBMAX), \ + 1)) +/* Get mbmaxlen from mbminmaxlen. */ +#define DATA_MBMAXLEN(mbminmaxlen) ((ulint) ((mbminmaxlen) / DATA_MBMAX)) + +/* We now support 15 bits (up to 32767) collation number */ +#define MAX_CHAR_COLL_NUM 32767 + +/* Mask to get the Charset Collation number (0x7fff) */ +#define CHAR_COLL_MASK MAX_CHAR_COLL_NUM + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Gets the MySQL type code from a dtype. +@return MySQL type code; this is NOT an InnoDB type code! */ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + const dtype_t* type); /*!< in: type struct */ +/*********************************************************************//** +Determine how many bytes the first n characters of the given string occupy. +If the string is shorter than n characters, returns the number of bytes +the characters in the string occupy. +@return length of the prefix, in bytes */ +UNIV_INTERN +ulint +dtype_get_at_most_n_mbchars( +/*========================*/ + ulint prtype, /*!< in: precise type */ + ulint mbminmaxlen, /*!< in: minimum and maximum length of + a multi-byte character */ + ulint prefix_len, /*!< in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /*!< in: length of str (in bytes) */ + const char* str); /*!< in: the string whose prefix + length is being determined */ +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Checks if a data main type is a string type. Also a BLOB is considered a +string type. +@return TRUE if string type */ +UNIV_INTERN +ibool +dtype_is_string_type( +/*=================*/ + ulint mtype); /*!< in: InnoDB main data type code: DATA_CHAR, ... */ +/*********************************************************************//** +Checks if a type is a binary string type. Note that for tables created with +< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For +those DATA_BLOB columns this function currently returns FALSE. +@return TRUE if binary string type */ +UNIV_INTERN +ibool +dtype_is_binary_string_type( +/*========================*/ + ulint mtype, /*!< in: main data type */ + ulint prtype);/*!< in: precise type */ +/*********************************************************************//** +Checks if a type is a non-binary string type. That is, dtype_is_string_type is +TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created +with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. +For those DATA_BLOB columns this function currently returns TRUE. +@return TRUE if non-binary string type */ +UNIV_INTERN +ibool +dtype_is_non_binary_string_type( +/*============================*/ + ulint mtype, /*!< in: main data type */ + ulint prtype);/*!< in: precise type */ +/*********************************************************************//** +Sets a data type structure. */ +UNIV_INLINE +void +dtype_set( +/*======*/ + dtype_t* type, /*!< in: type struct to init */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint len); /*!< in: precision of type */ +/*********************************************************************//** +Copies a data type structure. */ +UNIV_INLINE +void +dtype_copy( +/*=======*/ + dtype_t* type1, /*!< in: type struct to copy to */ + const dtype_t* type2); /*!< in: type struct to copy from */ +/*********************************************************************//** +Gets the SQL main data type. +@return SQL main data type */ +UNIV_INLINE +ulint +dtype_get_mtype( +/*============*/ + const dtype_t* type); /*!< in: data type */ +/*********************************************************************//** +Gets the precise data type. +@return precise data type */ +UNIV_INLINE +ulint +dtype_get_prtype( +/*=============*/ + const dtype_t* type); /*!< in: data type */ +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_get_mblen( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type (and collation) */ + ulint* mbminlen, /*!< out: minimum length of a + multi-byte character */ + ulint* mbmaxlen); /*!< out: maximum length of a + multi-byte character */ +/*********************************************************************//** +Gets the MySQL charset-collation code for MySQL string types. +@return MySQL charset-collation code */ +UNIV_INLINE +ulint +dtype_get_charset_coll( +/*===================*/ + ulint prtype);/*!< in: precise data type */ +/*********************************************************************//** +Forms a precise type from the < 4.1.2 format precise type plus the +charset-collation code. +@return precise type, including the charset-collation code */ +UNIV_INTERN +ulint +dtype_form_prtype( +/*==============*/ + ulint old_prtype, /*!< in: the MySQL type code and the flags + DATA_BINARY_TYPE etc. */ + ulint charset_coll); /*!< in: MySQL charset-collation code */ +/*********************************************************************//** +Determines if a MySQL string type is a subset of UTF-8. This function +may return false negatives, in case further character-set collation +codes are introduced in MySQL later. +@return TRUE if a subset of UTF-8 */ +UNIV_INLINE +ibool +dtype_is_utf8( +/*==========*/ + ulint prtype);/*!< in: precise data type */ +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Gets the type length. +@return fixed length of the type, in bytes, or 0 if variable-length */ +UNIV_INLINE +ulint +dtype_get_len( +/*==========*/ + const dtype_t* type); /*!< in: data type */ +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Gets the minimum length of a character, in bytes. +@return minimum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbminlen( +/*===============*/ + const dtype_t* type); /*!< in: type */ +/*********************************************************************//** +Gets the maximum length of a character, in bytes. +@return maximum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbmaxlen( +/*===============*/ + const dtype_t* type); /*!< in: type */ +/*********************************************************************//** +Sets the minimum and maximum length of a character, in bytes. */ +UNIV_INLINE +void +dtype_set_mbminmaxlen( +/*==================*/ + dtype_t* type, /*!< in/out: type */ + ulint mbminlen, /*!< in: minimum length of a char, + in bytes, or 0 if this is not + a character type */ + ulint mbmaxlen); /*!< in: maximum length of a char, + in bytes, or 0 if this is not + a character type */ +/*********************************************************************//** +Gets the padding character code for the type. +@return padding character code, or ULINT_UNDEFINED if no padding specified */ +UNIV_INLINE +ulint +dtype_get_pad_char( +/*===============*/ + ulint mtype, /*!< in: main type */ + ulint prtype); /*!< in: precise type */ +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************************//** +Returns the size of a fixed size data type, 0 if not a fixed size type. +@return fixed size, or 0 */ +UNIV_INLINE +ulint +dtype_get_fixed_size_low( +/*=====================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminmaxlen, /*!< in: minimum and maximum length of a + multibyte character, in bytes */ + ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Returns the minimum size of a data type. +@return minimum size */ +UNIV_INLINE +ulint +dtype_get_min_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminmaxlen); /*!< in: minimum and maximum length of a + multibyte character */ +/***********************************************************************//** +Returns the maximum size of a data type. Note: types in system tables may be +incomplete and return incorrect information. +@return maximum size */ +UNIV_INLINE +ulint +dtype_get_max_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint len); /*!< in: length */ +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + const dtype_t* type, /*!< in: type */ + ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. */ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf); /*!< in: buffer for the stored order info */ +/**********************************************************************//** +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /*!< in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + bytes where we store the info */ + const dtype_t* type, /*!< in: type struct */ + ulint prefix_len);/*!< in: prefix length to + replace type->len, or 0 */ +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf); /*!< in: buffer for stored type order info */ + +/*********************************************************************//** +Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len +@return the SQL type name */ +UNIV_INLINE +char* +dtype_sql_name( +/*===========*/ + unsigned mtype, /*!< in: mtype */ + unsigned prtype, /*!< in: prtype */ + unsigned len, /*!< in: len */ + char* name, /*!< out: SQL name */ + unsigned name_sz);/*!< in: size of the name buffer */ + +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Validates a data type structure. +@return TRUE if ok */ +UNIV_INTERN +ibool +dtype_validate( +/*===========*/ + const dtype_t* type); /*!< in: type struct to validate */ +/*********************************************************************//** +Prints a data type structure. */ +UNIV_INTERN +void +dtype_print( +/*========*/ + const dtype_t* type); /*!< in: type */ + +/* Structure for an SQL data type. +If you add fields to this structure, be sure to initialize them everywhere. +This structure is initialized in the following functions: +dtype_set() +dtype_read_for_order_and_null_size() +dtype_new_read_for_order_and_null_size() +sym_tab_add_null_lit() */ + +struct dtype_t{ + unsigned prtype:32; /*!< precise type; MySQL data + type, charset code, flags to + indicate nullability, + signedness, whether this is a + binary string, whether this is + a true VARCHAR where MySQL + uses 2 bytes to store the length */ + unsigned mtype:8; /*!< main data type */ + + /* the remaining fields do not affect alphabetical ordering: */ + + unsigned len:16; /*!< length; for MySQL data this + is field->pack_length(), + except that for a >= 5.0.3 + type true VARCHAR this is the + maximum byte length of the + string data (in addition to + the string, MySQL uses 1 or 2 + bytes to store the string length) */ +#ifndef UNIV_HOTBACKUP + unsigned mbminmaxlen:5; /*!< minimum and maximum length of a + character, in bytes; + DATA_MBMINMAXLEN(mbminlen,mbmaxlen); + mbminlen=DATA_MBMINLEN(mbminmaxlen); + mbmaxlen=DATA_MBMINLEN(mbminmaxlen) */ +#endif /* !UNIV_HOTBACKUP */ +}; + +#ifndef UNIV_NONINL +#include "data0type.ic" +#endif + +#endif diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic new file mode 100644 index 00000000000..d489bef89a8 --- /dev/null +++ b/storage/innobase/include/data0type.ic @@ -0,0 +1,711 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/data0type.ic +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#include <string.h> /* strlen() */ + +#include "mach0data.h" +#ifndef UNIV_HOTBACKUP +# include "ha_prototypes.h" + +/*********************************************************************//** +Gets the MySQL charset-collation code for MySQL string types. +@return MySQL charset-collation code */ +UNIV_INLINE +ulint +dtype_get_charset_coll( +/*===================*/ + ulint prtype) /*!< in: precise data type */ +{ + return((prtype >> 16) & CHAR_COLL_MASK); +} + +/*********************************************************************//** +Determines if a MySQL string type is a subset of UTF-8. This function +may return false negatives, in case further character-set collation +codes are introduced in MySQL later. +@return TRUE if a subset of UTF-8 */ +UNIV_INLINE +ibool +dtype_is_utf8( +/*==========*/ + ulint prtype) /*!< in: precise data type */ +{ + /* These codes have been copied from strings/ctype-extra.c + and strings/ctype-utf8.c. */ + switch (dtype_get_charset_coll(prtype)) { + case 11: /* ascii_general_ci */ + case 65: /* ascii_bin */ + case 33: /* utf8_general_ci */ + case 83: /* utf8_bin */ + case 254: /* utf8_general_cs */ + return(TRUE); + } + + return(FALSE); +} + +/*********************************************************************//** +Gets the MySQL type code from a dtype. +@return MySQL type code; this is NOT an InnoDB type code! */ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + const dtype_t* type) /*!< in: type struct */ +{ + return(type->prtype & 0xFFUL); +} + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_get_mblen( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type (and collation) */ + ulint* mbminlen, /*!< out: minimum length of a + multi-byte character */ + ulint* mbmaxlen) /*!< out: maximum length of a + multi-byte character */ +{ + if (dtype_is_string_type(mtype)) { + innobase_get_cset_width(dtype_get_charset_coll(prtype), + mbminlen, mbmaxlen); + ut_ad(*mbminlen <= *mbmaxlen); + ut_ad(*mbminlen < DATA_MBMAX); + ut_ad(*mbmaxlen < DATA_MBMAX); + } else { + *mbminlen = *mbmaxlen = 0; + } +} + +/*********************************************************************//** +Sets the minimum and maximum length of a character, in bytes. */ +UNIV_INLINE +void +dtype_set_mbminmaxlen( +/*==================*/ + dtype_t* type, /*!< in/out: type */ + ulint mbminlen, /*!< in: minimum length of a char, + in bytes, or 0 if this is not + a character type */ + ulint mbmaxlen) /*!< in: maximum length of a char, + in bytes, or 0 if this is not + a character type */ +{ + ut_ad(mbminlen < DATA_MBMAX); + ut_ad(mbmaxlen < DATA_MBMAX); + ut_ad(mbminlen <= mbmaxlen); + + type->mbminmaxlen = DATA_MBMINMAXLEN(mbminlen, mbmaxlen); +} + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_set_mblen( +/*============*/ + dtype_t* type) /*!< in/out: type */ +{ + ulint mbminlen; + ulint mbmaxlen; + + dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen); + dtype_set_mbminmaxlen(type, mbminlen, mbmaxlen); + + ut_ad(dtype_validate(type)); +} +#else /* !UNIV_HOTBACKUP */ +# define dtype_set_mblen(type) (void) 0 +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Sets a data type structure. */ +UNIV_INLINE +void +dtype_set( +/*======*/ + dtype_t* type, /*!< in: type struct to init */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint len) /*!< in: precision of type */ +{ + ut_ad(type); + ut_ad(mtype <= DATA_MTYPE_MAX); + + type->mtype = mtype; + type->prtype = prtype; + type->len = len; + + dtype_set_mblen(type); +} + +/*********************************************************************//** +Copies a data type structure. */ +UNIV_INLINE +void +dtype_copy( +/*=======*/ + dtype_t* type1, /*!< in: type struct to copy to */ + const dtype_t* type2) /*!< in: type struct to copy from */ +{ + *type1 = *type2; + + ut_ad(dtype_validate(type1)); +} + +/*********************************************************************//** +Gets the SQL main data type. +@return SQL main data type */ +UNIV_INLINE +ulint +dtype_get_mtype( +/*============*/ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(type); + + return(type->mtype); +} + +/*********************************************************************//** +Gets the precise data type. +@return precise data type */ +UNIV_INLINE +ulint +dtype_get_prtype( +/*=============*/ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(type); + + return(type->prtype); +} + +/*********************************************************************//** +Gets the type length. +@return fixed length of the type, in bytes, or 0 if variable-length */ +UNIV_INLINE +ulint +dtype_get_len( +/*==========*/ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(type); + + return(type->len); +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Gets the minimum length of a character, in bytes. +@return minimum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbminlen( +/*===============*/ + const dtype_t* type) /*!< in: type */ +{ + ut_ad(type); + return(DATA_MBMINLEN(type->mbminmaxlen)); +} +/*********************************************************************//** +Gets the maximum length of a character, in bytes. +@return maximum length of a char, in bytes, or 0 if this is not a +character type */ +UNIV_INLINE +ulint +dtype_get_mbmaxlen( +/*===============*/ + const dtype_t* type) /*!< in: type */ +{ + ut_ad(type); + return(DATA_MBMAXLEN(type->mbminmaxlen)); +} + +/*********************************************************************//** +Gets the padding character code for a type. +@return padding character code, or ULINT_UNDEFINED if no padding specified */ +UNIV_INLINE +ulint +dtype_get_pad_char( +/*===============*/ + ulint mtype, /*!< in: main type */ + ulint prtype) /*!< in: precise type */ +{ + switch (mtype) { + case DATA_FIXBINARY: + case DATA_BINARY: + if (dtype_get_charset_coll(prtype) + == DATA_MYSQL_BINARY_CHARSET_COLL) { + /* Starting from 5.0.18, do not pad + VARBINARY or BINARY columns. */ + return(ULINT_UNDEFINED); + } + /* Fall through */ + case DATA_CHAR: + case DATA_VARCHAR: + case DATA_MYSQL: + case DATA_VARMYSQL: + /* Space is the padding character for all char and binary + strings, and starting from 5.0.3, also for TEXT strings. */ + + return(0x20); + case DATA_BLOB: + if (!(prtype & DATA_BINARY_TYPE)) { + return(0x20); + } + /* Fall through */ + default: + /* No padding specified */ + return(ULINT_UNDEFINED); + } +} + +/**********************************************************************//** +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /*!< in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + bytes where we store the info */ + const dtype_t* type, /*!< in: type struct */ + ulint prefix_len)/*!< in: prefix length to + replace type->len, or 0 */ +{ +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + ulint len; + + ut_ad(type); + ut_ad(type->mtype >= DATA_VARCHAR); + ut_ad(type->mtype <= DATA_MYSQL); + + buf[0] = (byte)(type->mtype & 0xFFUL); + + if (type->prtype & DATA_BINARY_TYPE) { + buf[0] |= 128; + } + + /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) { + buf[0] |= 64; + } + */ + + buf[1] = (byte)(type->prtype & 0xFFUL); + + len = prefix_len ? prefix_len : type->len; + + mach_write_to_2(buf + 2, len & 0xFFFFUL); + + ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM); + mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype)); + + if (type->prtype & DATA_NOT_NULL) { + buf[4] |= 128; + } +} + +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the < 4.1.x +storage format. */ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf) /*!< in: buffer for stored type order info */ +{ +#if 4 != DATA_ORDER_NULL_TYPE_BUF_SIZE +# error "4 != DATA_ORDER_NULL_TYPE_BUF_SIZE" +#endif + + type->mtype = buf[0] & 63; + type->prtype = buf[1]; + + if (buf[0] & 128) { + type->prtype |= DATA_BINARY_TYPE; + } + + type->len = mach_read_from_2(buf + 2); + + type->prtype = dtype_form_prtype(type->prtype, + data_mysql_default_charset_coll); + dtype_set_mblen(type); +} + +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the >= 4.1.x +storage format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf) /*!< in: buffer for stored type order info */ +{ + ulint charset_coll; + +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + + type->mtype = buf[0] & 63; + type->prtype = buf[1]; + + if (buf[0] & 128) { + type->prtype |= DATA_BINARY_TYPE; + } + + if (buf[4] & 128) { + type->prtype |= DATA_NOT_NULL; + } + + type->len = mach_read_from_2(buf + 2); + + charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK; + + if (dtype_is_string_type(type->mtype)) { + ut_a(charset_coll <= MAX_CHAR_COLL_NUM); + + if (charset_coll == 0) { + /* This insert buffer record was inserted with MySQL + version < 4.1.2, and the charset-collation code was not + explicitly stored to dtype->prtype at that time. It + must be the default charset-collation of this MySQL + installation. */ + + charset_coll = data_mysql_default_charset_coll; + } + + type->prtype = dtype_form_prtype(type->prtype, charset_coll); + } + dtype_set_mblen(type); +} + +/*********************************************************************//** +Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len +@return the SQL type name */ +UNIV_INLINE +char* +dtype_sql_name( +/*===========*/ + unsigned mtype, /*!< in: mtype */ + unsigned prtype, /*!< in: prtype */ + unsigned len, /*!< in: len */ + char* name, /*!< out: SQL name */ + unsigned name_sz)/*!< in: size of the name buffer */ +{ + +#define APPEND_UNSIGNED() \ + do { \ + if (prtype & DATA_UNSIGNED) { \ + ut_snprintf(name + strlen(name), \ + name_sz - strlen(name), \ + " UNSIGNED"); \ + } \ + } while (0) + + ut_snprintf(name, name_sz, "UNKNOWN"); + + switch (mtype) { + case DATA_INT: + switch (len) { + case 1: + ut_snprintf(name, name_sz, "TINYINT"); + break; + case 2: + ut_snprintf(name, name_sz, "SMALLINT"); + break; + case 3: + ut_snprintf(name, name_sz, "MEDIUMINT"); + break; + case 4: + ut_snprintf(name, name_sz, "INT"); + break; + case 8: + ut_snprintf(name, name_sz, "BIGINT"); + break; + } + APPEND_UNSIGNED(); + break; + case DATA_FLOAT: + ut_snprintf(name, name_sz, "FLOAT"); + APPEND_UNSIGNED(); + break; + case DATA_DOUBLE: + ut_snprintf(name, name_sz, "DOUBLE"); + APPEND_UNSIGNED(); + break; + case DATA_FIXBINARY: + ut_snprintf(name, name_sz, "BINARY(%u)", len); + break; + case DATA_CHAR: + case DATA_MYSQL: + ut_snprintf(name, name_sz, "CHAR(%u)", len); + break; + case DATA_VARCHAR: + case DATA_VARMYSQL: + ut_snprintf(name, name_sz, "VARCHAR(%u)", len); + break; + case DATA_BINARY: + ut_snprintf(name, name_sz, "VARBINARY(%u)", len); + break; + case DATA_BLOB: + switch (len) { + case 9: + ut_snprintf(name, name_sz, "TINYBLOB"); + break; + case 10: + ut_snprintf(name, name_sz, "BLOB"); + break; + case 11: + ut_snprintf(name, name_sz, "MEDIUMBLOB"); + break; + case 12: + ut_snprintf(name, name_sz, "LONGBLOB"); + break; + } + } + + if (prtype & DATA_NOT_NULL) { + ut_snprintf(name + strlen(name), + name_sz - strlen(name), + " NOT NULL"); + } + + return(name); +} + +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +Returns the size of a fixed size data type, 0 if not a fixed size type. +@return fixed size, or 0 */ +UNIV_INLINE +ulint +dtype_get_fixed_size_low( +/*=====================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminmaxlen, /*!< in: minimum and maximum length of + a multibyte character, in bytes */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (prtype & DATA_MYSQL_TYPE_MASK) { + case DATA_ROW_ID: + ut_ad(len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(len == DATA_ROLL_PTR_LEN); + break; + default: + ut_ad(0); + return(0); + } +#endif /* UNIV_DEBUG */ + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + return(len); + case DATA_MYSQL: +#ifndef UNIV_HOTBACKUP + if (prtype & DATA_BINARY_TYPE) { + return(len); + } else if (!comp) { + return(len); + } else { +#ifdef UNIV_DEBUG + ulint i_mbminlen, i_mbmaxlen; + + innobase_get_cset_width( + dtype_get_charset_coll(prtype), + &i_mbminlen, &i_mbmaxlen); + + ut_ad(DATA_MBMINMAXLEN(i_mbminlen, i_mbmaxlen) + == mbminmaxlen); +#endif /* UNIV_DEBUG */ + if (DATA_MBMINLEN(mbminmaxlen) + == DATA_MBMAXLEN(mbminmaxlen)) { + return(len); + } + } +#else /* !UNIV_HOTBACKUP */ + return(len); +#endif /* !UNIV_HOTBACKUP */ + /* fall through for variable-length charsets */ + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + case DATA_BLOB: + return(0); + default: + ut_error; + } + + return(0); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Returns the minimum size of a data type. +@return minimum size */ +UNIV_INLINE +ulint +dtype_get_min_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + ulint len, /*!< in: length */ + ulint mbminmaxlen) /*!< in: minimum and maximum length of a + multi-byte character */ +{ + switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (prtype & DATA_MYSQL_TYPE_MASK) { + case DATA_ROW_ID: + ut_ad(len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(len == DATA_ROLL_PTR_LEN); + break; + default: + ut_ad(0); + return(0); + } +#endif /* UNIV_DEBUG */ + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + return(len); + case DATA_MYSQL: + if (prtype & DATA_BINARY_TYPE) { + return(len); + } else { + ulint mbminlen = DATA_MBMINLEN(mbminmaxlen); + ulint mbmaxlen = DATA_MBMAXLEN(mbminmaxlen); + + if (mbminlen == mbmaxlen) { + return(len); + } + + /* this is a variable-length character set */ + ut_a(mbminlen > 0); + ut_a(mbmaxlen > mbminlen); + ut_a(len % mbmaxlen == 0); + return(len * mbminlen / mbmaxlen); + } + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + case DATA_BLOB: + return(0); + default: + ut_error; + } + + return(0); +} + +/***********************************************************************//** +Returns the maximum size of a data type. Note: types in system tables may be +incomplete and return incorrect information. +@return maximum size */ +UNIV_INLINE +ulint +dtype_get_max_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint len) /*!< in: length */ +{ + switch (mtype) { + case DATA_SYS: + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_MYSQL: + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + return(len); + case DATA_BLOB: + break; + default: + ut_error; + } + + return(ULINT_MAX); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + const dtype_t* type, /*!< in: type */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ +#ifndef UNIV_HOTBACKUP + return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len, + type->mbminmaxlen, comp)); +#else /* !UNIV_HOTBACKUP */ + return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len, + 0, 0)); +#endif /* !UNIV_HOTBACKUP */ +} diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h new file mode 100644 index 00000000000..bd2bb577611 --- /dev/null +++ b/storage/innobase/include/data0types.h @@ -0,0 +1,36 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0types.h +Some type definitions + +Created 9/21/2000 Heikki Tuuri +*************************************************************************/ + +#ifndef data0types_h +#define data0types_h + +/* SQL data field struct */ +struct dfield_t; + +/* SQL data tuple struct */ +struct dtuple_t; + +#endif + diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h new file mode 100644 index 00000000000..1e87ce3fdb8 --- /dev/null +++ b/storage/innobase/include/db0err.h @@ -0,0 +1,161 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/db0err.h +Global error codes for the database + +Created 5/24/1996 Heikki Tuuri +*******************************************************/ + +#ifndef db0err_h +#define db0err_h + + +enum dberr_t { + DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new + explicit record lock was created */ + DB_SUCCESS = 10, + + /* The following are error codes */ + DB_ERROR, + DB_INTERRUPTED, + DB_OUT_OF_MEMORY, + DB_OUT_OF_FILE_SPACE, + DB_LOCK_WAIT, + DB_DEADLOCK, + DB_ROLLBACK, + DB_DUPLICATE_KEY, + DB_QUE_THR_SUSPENDED, + DB_MISSING_HISTORY, /*!< required history data has been + deleted due to lack of space in + rollback segment */ + DB_CLUSTER_NOT_FOUND = 30, + DB_TABLE_NOT_FOUND, + DB_MUST_GET_MORE_FILE_SPACE, /*!< the database has to be stopped + and restarted with more file space */ + DB_TABLE_IS_BEING_USED, + DB_TOO_BIG_RECORD, /*!< a record in an index would not fit + on a compressed page, or it would + become bigger than 1/2 free space in + an uncompressed page frame */ + DB_LOCK_WAIT_TIMEOUT, /*!< lock wait lasted too long */ + DB_NO_REFERENCED_ROW, /*!< referenced key value not found + for a foreign key in an insert or + update of a row */ + DB_ROW_IS_REFERENCED, /*!< cannot delete or update a row + because it contains a key value + which is referenced */ + DB_CANNOT_ADD_CONSTRAINT, /*!< adding a foreign key constraint + to a table failed */ + DB_CORRUPTION, /*!< data structure corruption noticed */ + DB_CANNOT_DROP_CONSTRAINT, /*!< dropping a foreign key constraint + from a table failed */ + DB_NO_SAVEPOINT, /*!< no savepoint exists with the given + name */ + DB_TABLESPACE_EXISTS, /*!< we cannot create a new single-table + tablespace because a file of the same + name already exists */ + DB_TABLESPACE_DELETED, /*!< tablespace was deleted or is + being dropped right now */ + DB_TABLESPACE_NOT_FOUND, /*<! Attempt to delete a tablespace + instance that was not found in the + tablespace hash table */ + DB_LOCK_TABLE_FULL, /*!< lock structs have exhausted the + buffer pool (for big transactions, + InnoDB stores the lock structs in the + buffer pool) */ + DB_FOREIGN_DUPLICATE_KEY, /*!< foreign key constraints + activated by the operation would + lead to a duplicate key in some + table */ + DB_TOO_MANY_CONCURRENT_TRXS, /*!< when InnoDB runs out of the + preconfigured undo slots, this can + only happen when there are too many + concurrent transactions */ + DB_UNSUPPORTED, /*!< when InnoDB sees any artefact or + a feature that it can't recoginize or + work with e.g., FT indexes created by + a later version of the engine. */ + + DB_INVALID_NULL, /*!< a NOT NULL column was found to + be NULL during table rebuild */ + + DB_STATS_DO_NOT_EXIST, /*!< an operation that requires the + persistent storage, used for recording + table and index statistics, was + requested but this storage does not + exist itself or the stats for a given + table do not exist */ + DB_FOREIGN_EXCEED_MAX_CASCADE, /*!< Foreign key constraint related + cascading delete/update exceeds + maximum allowed depth */ + DB_CHILD_NO_INDEX, /*!< the child (foreign) table does + not have an index that contains the + foreign keys as its prefix columns */ + DB_PARENT_NO_INDEX, /*!< the parent table does not + have an index that contains the + foreign keys as its prefix columns */ + DB_TOO_BIG_INDEX_COL, /*!< index column size exceeds + maximum limit */ + DB_INDEX_CORRUPT, /*!< we have corrupted index */ + DB_UNDO_RECORD_TOO_BIG, /*!< the undo log record is too big */ + DB_READ_ONLY, /*!< Update operation attempted in + a read-only transaction */ + DB_FTS_INVALID_DOCID, /* FTS Doc ID cannot be zero */ + DB_TABLE_IN_FK_CHECK, /* table is being used in foreign + key check */ + DB_ONLINE_LOG_TOO_BIG, /*!< Modification log grew too big + during online index creation */ + + DB_IO_ERROR, /*!< Generic IO error */ + DB_IDENTIFIER_TOO_LONG, /*!< Identifier name too long */ + DB_FTS_EXCEED_RESULT_CACHE_LIMIT, /*!< FTS query memory + exceeds result cache limit */ + DB_TEMP_FILE_WRITE_FAILURE, /*!< Temp file write failure */ + DB_FTS_TOO_MANY_WORDS_IN_PHRASE, + /*< Too many words in a phrase */ + DB_TOO_BIG_FOR_REDO, /* Record length greater than 10% + of redo log */ + /* The following are partial failure codes */ + DB_FAIL = 1000, + DB_OVERFLOW, + DB_UNDERFLOW, + DB_STRONG_FAIL, + DB_ZIP_OVERFLOW, + DB_RECORD_NOT_FOUND = 1500, + DB_END_OF_INDEX, + DB_DICT_CHANGED, /*!< Some part of table dictionary has + changed. Such as index dropped or + foreign key dropped */ + + + /* The following are API only error codes. */ + DB_DATA_MISMATCH = 2000, /*!< Column update or read failed + because the types mismatch */ + + DB_SCHEMA_NOT_LOCKED, /*!< If an API function expects the + schema to be locked in exclusive mode + and if it's not then that API function + will return this error code */ + + DB_NOT_FOUND /*!< Generic error code for "Not found" + type of errors */ +}; + +#endif diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h new file mode 100644 index 00000000000..a994c9d8ff1 --- /dev/null +++ b/storage/innobase/include/dict0boot.h @@ -0,0 +1,342 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0boot.h +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0boot_h +#define dict0boot_h + +#include "univ.i" + +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "ut0byte.h" +#include "buf0buf.h" +#include "fsp0fsp.h" +#include "dict0dict.h" + +typedef byte dict_hdr_t; + +/**********************************************************************//** +Gets a pointer to the dictionary header and x-latches its page. +@return pointer to the dictionary header, page x-latched */ +UNIV_INTERN +dict_hdr_t* +dict_hdr_get( +/*=========*/ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +Returns a new table, index, or space id. */ +UNIV_INTERN +void +dict_hdr_get_new_id( +/*================*/ + table_id_t* table_id, /*!< out: table id + (not assigned if NULL) */ + index_id_t* index_id, /*!< out: index id + (not assigned if NULL) */ + ulint* space_id); /*!< out: space id + (not assigned if NULL) */ +/**********************************************************************//** +Writes the current value of the row id counter to the dictionary header file +page. */ +UNIV_INTERN +void +dict_hdr_flush_row_id(void); +/*=======================*/ +/**********************************************************************//** +Returns a new row id. +@return the new id */ +UNIV_INLINE +row_id_t +dict_sys_get_new_row_id(void); +/*=========================*/ +/**********************************************************************//** +Reads a row id from a record or other 6-byte stored form. +@return row id */ +UNIV_INLINE +row_id_t +dict_sys_read_row_id( +/*=================*/ + const byte* field); /*!< in: record field */ +/**********************************************************************//** +Writes a row id to a record or other 6-byte stored form. */ +UNIV_INLINE +void +dict_sys_write_row_id( +/*==================*/ + byte* field, /*!< in: record field */ + row_id_t row_id);/*!< in: row id */ +/*****************************************************************//** +Initializes the data dictionary memory structures when the database is +started. This function is also called when the data dictionary is created. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +dict_boot(void) +/*===========*/ + __attribute__((warn_unused_result)); + +/*****************************************************************//** +Creates and initializes the data dictionary at the server bootstrap. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +dict_create(void) +/*=============*/ + __attribute__((warn_unused_result)); + +/*********************************************************************//** +Check if a table id belongs to system table. +@return true if the table id belongs to a system table. */ +UNIV_INLINE +bool +dict_is_sys_table( +/*==============*/ + table_id_t id) /*!< in: table id to check */ + __attribute__((warn_unused_result)); + +/* Space id and page no where the dictionary header resides */ +#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ +#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO + +/* The ids for the basic system tables and their indexes */ +#define DICT_TABLES_ID 1 +#define DICT_COLUMNS_ID 2 +#define DICT_INDEXES_ID 3 +#define DICT_FIELDS_ID 4 +/* The following is a secondary index on SYS_TABLES */ +#define DICT_TABLE_IDS_ID 5 + +#define DICT_HDR_FIRST_ID 10 /* the ids for tables etc. start + from this number, except for basic + system tables and their above defined + indexes; ibuf tables and indexes are + assigned as the id the number + DICT_IBUF_ID_MIN plus the space id */ + +/* The offset of the dictionary header on the page */ +#define DICT_HDR FSEG_PAGE_DATA + +/*-------------------------------------------------------------*/ +/* Dictionary header offsets */ +#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */ +#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */ +#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */ +#define DICT_HDR_MAX_SPACE_ID 24 /* The latest assigned space id,or 0*/ +#define DICT_HDR_MIX_ID_LOW 28 /* Obsolete,always DICT_HDR_FIRST_ID*/ +#define DICT_HDR_TABLES 32 /* Root of SYS_TABLES clust index */ +#define DICT_HDR_TABLE_IDS 36 /* Root of SYS_TABLE_IDS sec index */ +#define DICT_HDR_COLUMNS 40 /* Root of SYS_COLUMNS clust index */ +#define DICT_HDR_INDEXES 44 /* Root of SYS_INDEXES clust index */ +#define DICT_HDR_FIELDS 48 /* Root of SYS_FIELDS clust index */ + +#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace + segment into which the dictionary + header is created */ +/*-------------------------------------------------------------*/ + +/* The columns in SYS_TABLES */ +enum dict_col_sys_tables_enum { + DICT_COL__SYS_TABLES__NAME = 0, + DICT_COL__SYS_TABLES__ID = 1, + DICT_COL__SYS_TABLES__N_COLS = 2, + DICT_COL__SYS_TABLES__TYPE = 3, + DICT_COL__SYS_TABLES__MIX_ID = 4, + DICT_COL__SYS_TABLES__MIX_LEN = 5, + DICT_COL__SYS_TABLES__CLUSTER_ID = 6, + DICT_COL__SYS_TABLES__SPACE = 7, + DICT_NUM_COLS__SYS_TABLES = 8 +}; +/* The field numbers in the SYS_TABLES clustered index */ +enum dict_fld_sys_tables_enum { + DICT_FLD__SYS_TABLES__NAME = 0, + DICT_FLD__SYS_TABLES__DB_TRX_ID = 1, + DICT_FLD__SYS_TABLES__DB_ROLL_PTR = 2, + DICT_FLD__SYS_TABLES__ID = 3, + DICT_FLD__SYS_TABLES__N_COLS = 4, + DICT_FLD__SYS_TABLES__TYPE = 5, + DICT_FLD__SYS_TABLES__MIX_ID = 6, + DICT_FLD__SYS_TABLES__MIX_LEN = 7, + DICT_FLD__SYS_TABLES__CLUSTER_ID = 8, + DICT_FLD__SYS_TABLES__SPACE = 9, + DICT_NUM_FIELDS__SYS_TABLES = 10 +}; +/* The field numbers in the SYS_TABLE_IDS index */ +enum dict_fld_sys_table_ids_enum { + DICT_FLD__SYS_TABLE_IDS__ID = 0, + DICT_FLD__SYS_TABLE_IDS__NAME = 1, + DICT_NUM_FIELDS__SYS_TABLE_IDS = 2 +}; +/* The columns in SYS_COLUMNS */ +enum dict_col_sys_columns_enum { + DICT_COL__SYS_COLUMNS__TABLE_ID = 0, + DICT_COL__SYS_COLUMNS__POS = 1, + DICT_COL__SYS_COLUMNS__NAME = 2, + DICT_COL__SYS_COLUMNS__MTYPE = 3, + DICT_COL__SYS_COLUMNS__PRTYPE = 4, + DICT_COL__SYS_COLUMNS__LEN = 5, + DICT_COL__SYS_COLUMNS__PREC = 6, + DICT_NUM_COLS__SYS_COLUMNS = 7 +}; +/* The field numbers in the SYS_COLUMNS clustered index */ +enum dict_fld_sys_columns_enum { + DICT_FLD__SYS_COLUMNS__TABLE_ID = 0, + DICT_FLD__SYS_COLUMNS__POS = 1, + DICT_FLD__SYS_COLUMNS__DB_TRX_ID = 2, + DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_COLUMNS__NAME = 4, + DICT_FLD__SYS_COLUMNS__MTYPE = 5, + DICT_FLD__SYS_COLUMNS__PRTYPE = 6, + DICT_FLD__SYS_COLUMNS__LEN = 7, + DICT_FLD__SYS_COLUMNS__PREC = 8, + DICT_NUM_FIELDS__SYS_COLUMNS = 9 +}; +/* The columns in SYS_INDEXES */ +enum dict_col_sys_indexes_enum { + DICT_COL__SYS_INDEXES__TABLE_ID = 0, + DICT_COL__SYS_INDEXES__ID = 1, + DICT_COL__SYS_INDEXES__NAME = 2, + DICT_COL__SYS_INDEXES__N_FIELDS = 3, + DICT_COL__SYS_INDEXES__TYPE = 4, + DICT_COL__SYS_INDEXES__SPACE = 5, + DICT_COL__SYS_INDEXES__PAGE_NO = 6, + DICT_NUM_COLS__SYS_INDEXES = 7 +}; +/* The field numbers in the SYS_INDEXES clustered index */ +enum dict_fld_sys_indexes_enum { + DICT_FLD__SYS_INDEXES__TABLE_ID = 0, + DICT_FLD__SYS_INDEXES__ID = 1, + DICT_FLD__SYS_INDEXES__DB_TRX_ID = 2, + DICT_FLD__SYS_INDEXES__DB_ROLL_PTR = 3, + DICT_FLD__SYS_INDEXES__NAME = 4, + DICT_FLD__SYS_INDEXES__N_FIELDS = 5, + DICT_FLD__SYS_INDEXES__TYPE = 6, + DICT_FLD__SYS_INDEXES__SPACE = 7, + DICT_FLD__SYS_INDEXES__PAGE_NO = 8, + DICT_NUM_FIELDS__SYS_INDEXES = 9 +}; +/* The columns in SYS_FIELDS */ +enum dict_col_sys_fields_enum { + DICT_COL__SYS_FIELDS__INDEX_ID = 0, + DICT_COL__SYS_FIELDS__POS = 1, + DICT_COL__SYS_FIELDS__COL_NAME = 2, + DICT_NUM_COLS__SYS_FIELDS = 3 +}; +/* The field numbers in the SYS_FIELDS clustered index */ +enum dict_fld_sys_fields_enum { + DICT_FLD__SYS_FIELDS__INDEX_ID = 0, + DICT_FLD__SYS_FIELDS__POS = 1, + DICT_FLD__SYS_FIELDS__DB_TRX_ID = 2, + DICT_FLD__SYS_FIELDS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_FIELDS__COL_NAME = 4, + DICT_NUM_FIELDS__SYS_FIELDS = 5 +}; +/* The columns in SYS_FOREIGN */ +enum dict_col_sys_foreign_enum { + DICT_COL__SYS_FOREIGN__ID = 0, + DICT_COL__SYS_FOREIGN__FOR_NAME = 1, + DICT_COL__SYS_FOREIGN__REF_NAME = 2, + DICT_COL__SYS_FOREIGN__N_COLS = 3, + DICT_NUM_COLS__SYS_FOREIGN = 4 +}; +/* The field numbers in the SYS_FOREIGN clustered index */ +enum dict_fld_sys_foreign_enum { + DICT_FLD__SYS_FOREIGN__ID = 0, + DICT_FLD__SYS_FOREIGN__DB_TRX_ID = 1, + DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR = 2, + DICT_FLD__SYS_FOREIGN__FOR_NAME = 3, + DICT_FLD__SYS_FOREIGN__REF_NAME = 4, + DICT_FLD__SYS_FOREIGN__N_COLS = 5, + DICT_NUM_FIELDS__SYS_FOREIGN = 6 +}; +/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */ +enum dict_fld_sys_foreign_for_name_enum { + DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME = 0, + DICT_FLD__SYS_FOREIGN_FOR_NAME__ID = 1, + DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME = 2 +}; +/* The columns in SYS_FOREIGN_COLS */ +enum dict_col_sys_foreign_cols_enum { + DICT_COL__SYS_FOREIGN_COLS__ID = 0, + DICT_COL__SYS_FOREIGN_COLS__POS = 1, + DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME = 2, + DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME = 3, + DICT_NUM_COLS__SYS_FOREIGN_COLS = 4 +}; +/* The field numbers in the SYS_FOREIGN_COLS clustered index */ +enum dict_fld_sys_foreign_cols_enum { + DICT_FLD__SYS_FOREIGN_COLS__ID = 0, + DICT_FLD__SYS_FOREIGN_COLS__POS = 1, + DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID = 2, + DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME = 4, + DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5, + DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6 +}; +/* The columns in SYS_TABLESPACES */ +enum dict_col_sys_tablespaces_enum { + DICT_COL__SYS_TABLESPACES__SPACE = 0, + DICT_COL__SYS_TABLESPACES__NAME = 1, + DICT_COL__SYS_TABLESPACES__FLAGS = 2, + DICT_NUM_COLS__SYS_TABLESPACES = 3 +}; +/* The field numbers in the SYS_TABLESPACES clustered index */ +enum dict_fld_sys_tablespaces_enum { + DICT_FLD__SYS_TABLESPACES__SPACE = 0, + DICT_FLD__SYS_TABLESPACES__DB_TRX_ID = 1, + DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR = 2, + DICT_FLD__SYS_TABLESPACES__NAME = 3, + DICT_FLD__SYS_TABLESPACES__FLAGS = 4, + DICT_NUM_FIELDS__SYS_TABLESPACES = 5 +}; +/* The columns in SYS_DATAFILES */ +enum dict_col_sys_datafiles_enum { + DICT_COL__SYS_DATAFILES__SPACE = 0, + DICT_COL__SYS_DATAFILES__PATH = 1, + DICT_NUM_COLS__SYS_DATAFILES = 2 +}; +/* The field numbers in the SYS_DATAFILES clustered index */ +enum dict_fld_sys_datafiles_enum { + DICT_FLD__SYS_DATAFILES__SPACE = 0, + DICT_FLD__SYS_DATAFILES__DB_TRX_ID = 1, + DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR = 2, + DICT_FLD__SYS_DATAFILES__PATH = 3, + DICT_NUM_FIELDS__SYS_DATAFILES = 4 +}; + +/* A number of the columns above occur in multiple tables. These are the +length of thos fields. */ +#define DICT_FLD_LEN_SPACE 4 +#define DICT_FLD_LEN_FLAGS 4 + +/* When a row id which is zero modulo this number (which must be a power of +two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is +updated */ +#define DICT_HDR_ROW_ID_WRITE_MARGIN 256 + +#ifndef UNIV_NONINL +#include "dict0boot.ic" +#endif + +#endif diff --git a/storage/innobase/include/dict0boot.ic b/storage/innobase/include/dict0boot.ic new file mode 100644 index 00000000000..2b156a4f672 --- /dev/null +++ b/storage/innobase/include/dict0boot.ic @@ -0,0 +1,96 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0boot.ic +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +/**********************************************************************//** +Returns a new row id. +@return the new id */ +UNIV_INLINE +row_id_t +dict_sys_get_new_row_id(void) +/*=========================*/ +{ + row_id_t id; + + mutex_enter(&(dict_sys->mutex)); + + id = dict_sys->row_id; + + if (0 == (id % DICT_HDR_ROW_ID_WRITE_MARGIN)) { + + dict_hdr_flush_row_id(); + } + + dict_sys->row_id++; + + mutex_exit(&(dict_sys->mutex)); + + return(id); +} + +/**********************************************************************//** +Reads a row id from a record or other 6-byte stored form. +@return row id */ +UNIV_INLINE +row_id_t +dict_sys_read_row_id( +/*=================*/ + const byte* field) /*!< in: record field */ +{ +#if DATA_ROW_ID_LEN != 6 +# error "DATA_ROW_ID_LEN != 6" +#endif + + return(mach_read_from_6(field)); +} + +/**********************************************************************//** +Writes a row id to a record or other 6-byte stored form. */ +UNIV_INLINE +void +dict_sys_write_row_id( +/*==================*/ + byte* field, /*!< in: record field */ + row_id_t row_id) /*!< in: row id */ +{ +#if DATA_ROW_ID_LEN != 6 +# error "DATA_ROW_ID_LEN != 6" +#endif + + mach_write_to_6(field, row_id); +} + +/*********************************************************************//** +Check if a table id belongs to system table. +@return true if the table id belongs to a system table. */ +UNIV_INLINE +bool +dict_is_sys_table( +/*==============*/ + table_id_t id) /*!< in: table id to check */ +{ + return(id < DICT_HDR_FIRST_ID); +} + + diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h new file mode 100644 index 00000000000..67eab9058da --- /dev/null +++ b/storage/innobase/include/dict0crea.h @@ -0,0 +1,246 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0crea.h +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0crea_h +#define dict0crea_h + +#include "univ.i" +#include "dict0types.h" +#include "dict0dict.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/*********************************************************************//** +Creates a table create graph. +@return own: table create node */ +UNIV_INTERN +tab_node_t* +tab_create_graph_create( +/*====================*/ + dict_table_t* table, /*!< in: table to create, built as a memory data + structure */ + mem_heap_t* heap, /*!< in: heap where created */ + bool commit);/*!< in: true if the commit node should be + added to the query graph */ +/*********************************************************************//** +Creates an index create graph. +@return own: index create node */ +UNIV_INTERN +ind_node_t* +ind_create_graph_create( +/*====================*/ + dict_index_t* index, /*!< in: index to create, built as a memory data + structure */ + mem_heap_t* heap, /*!< in: heap where created */ + bool commit);/*!< in: true if the commit node should be + added to the query graph */ +/***********************************************************//** +Creates a table. This is a high-level function used in SQL execution graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +dict_create_table_step( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************//** +Creates an index. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +dict_create_index_step( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ +/*******************************************************************//** +Truncates the index tree associated with a row in SYS_INDEXES table. +@return new root page number, or FIL_NULL on failure */ +UNIV_INTERN +ulint +dict_truncate_index_tree( +/*=====================*/ + dict_table_t* table, /*!< in: the table the index belongs to */ + ulint space, /*!< in: 0=truncate, + nonzero=create the index tree in the + given tablespace */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing to + record in the clustered index of + SYS_INDEXES table. The cursor may be + repositioned in this call. */ + mtr_t* mtr); /*!< in: mtr having the latch + on the record page. The mtr may be + committed and restarted in this call. */ +/*******************************************************************//** +Drops the index tree associated with a row in SYS_INDEXES table. */ +UNIV_INTERN +void +dict_drop_index_tree( +/*=================*/ + rec_t* rec, /*!< in/out: record in the clustered index + of SYS_INDEXES table */ + mtr_t* mtr); /*!< in: mtr having the latch on the record page */ +/****************************************************************//** +Creates the foreign key constraints system tables inside InnoDB +at server bootstrap or server start if they are not found or are +not of the right form. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_create_or_check_foreign_constraint_tables(void); +/*================================================*/ +/********************************************************************//** +Generate a foreign key constraint name when it was not named by the user. +A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER, +where the numbers start from 1, and are given locally for this table, that is, +the number is not global, as it used to be before MySQL 4.0.18. */ +UNIV_INLINE +dberr_t +dict_create_add_foreign_id( +/*=======================*/ + ulint* id_nr, /*!< in/out: number to use in id generation; + incremented if used */ + const char* name, /*!< in: table name */ + dict_foreign_t* foreign)/*!< in/out: foreign key */ + __attribute__((nonnull)); + +/** Adds the given set of foreign key objects to the dictionary tables +in the database. This function does not modify the dictionary cache. The +caller must ensure that all foreign key objects contain a valid constraint +name in foreign->id. +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@param[in,out] trx transaction +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_foreigns_to_dictionary( +/*===================================*/ + const dict_foreign_set& local_fk_set, + const dict_table_t* table, + trx_t* trx) + __attribute__((nonnull, warn_unused_result)); +/****************************************************************//** +Creates the tablespaces and datafiles system tables inside InnoDB +at server bootstrap or server start if they are not found or are +not of the right form. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_create_or_check_sys_tablespace(void); +/*=====================================*/ +/********************************************************************//** +Add a single tablespace definition to the data dictionary tables in the +database. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_tablespace_to_dictionary( +/*=====================================*/ + ulint space, /*!< in: tablespace id */ + const char* name, /*!< in: tablespace name */ + ulint flags, /*!< in: tablespace flags */ + const char* path, /*!< in: tablespace path */ + trx_t* trx, /*!< in: transaction */ + bool commit); /*!< in: if true then commit the + transaction */ +/********************************************************************//** +Add a foreign key definition to the data dictionary tables. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_add_foreign_to_dictionary( +/*==================================*/ + const char* name, /*!< in: table name */ + const dict_foreign_t* foreign,/*!< in: foreign key */ + trx_t* trx) /*!< in/out: dictionary transaction */ + __attribute__((nonnull, warn_unused_result)); + +/* Table create node structure */ +struct tab_node_t{ + que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */ + dict_table_t* table; /*!< table to create, built as a memory data + structure with dict_mem_... functions */ + ins_node_t* tab_def; /* child node which does the insert of + the table definition; the row to be inserted + is built by the parent node */ + ins_node_t* col_def; /* child node which does the inserts of + the column definitions; the row to be inserted + is built by the parent node */ + commit_node_t* commit_node; + /* child node which performs a commit after + a successful table creation */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + ulint col_no; /*!< next column definition to insert */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage */ +}; + +/* Table create node states */ +#define TABLE_BUILD_TABLE_DEF 1 +#define TABLE_BUILD_COL_DEF 2 +#define TABLE_COMMIT_WORK 3 +#define TABLE_ADD_TO_CACHE 4 +#define TABLE_COMPLETED 5 + +/* Index create node struct */ + +struct ind_node_t{ + que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */ + dict_index_t* index; /*!< index to create, built as a memory data + structure with dict_mem_... functions */ + ins_node_t* ind_def; /* child node which does the insert of + the index definition; the row to be inserted + is built by the parent node */ + ins_node_t* field_def; /* child node which does the inserts of + the field definitions; the row to be inserted + is built by the parent node */ + commit_node_t* commit_node; + /* child node which performs a commit after + a successful index creation */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + ulint page_no;/* root page number of the index */ + dict_table_t* table; /*!< table which owns the index */ + dtuple_t* ind_row;/* index definition row built */ + ulint field_no;/* next field definition to insert */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage */ +}; + +/* Index create node states */ +#define INDEX_BUILD_INDEX_DEF 1 +#define INDEX_BUILD_FIELD_DEF 2 +#define INDEX_CREATE_INDEX_TREE 3 +#define INDEX_COMMIT_WORK 4 +#define INDEX_ADD_TO_CACHE 5 + +#ifndef UNIV_NONINL +#include "dict0crea.ic" +#endif + +#endif diff --git a/storage/innobase/include/dict0crea.ic b/storage/innobase/include/dict0crea.ic new file mode 100644 index 00000000000..2d0d9dcb858 --- /dev/null +++ b/storage/innobase/include/dict0crea.ic @@ -0,0 +1,98 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0crea.ic +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#include "mem0mem.h" + +/*********************************************************************//** +Checks if a table name contains the string "/#sql" which denotes temporary +tables in MySQL. +@return true if temporary table */ +UNIV_INTERN +bool +row_is_mysql_tmp_table_name( +/*========================*/ + const char* name) __attribute__((warn_unused_result)); + /*!< in: table name in the form + 'database/tablename' */ + + +/********************************************************************//** +Generate a foreign key constraint name when it was not named by the user. +A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER, +where the numbers start from 1, and are given locally for this table, that is, +the number is not global, as it used to be before MySQL 4.0.18. */ +UNIV_INLINE +dberr_t +dict_create_add_foreign_id( +/*=======================*/ + ulint* id_nr, /*!< in/out: number to use in id generation; + incremented if used */ + const char* name, /*!< in: table name */ + dict_foreign_t* foreign)/*!< in/out: foreign key */ +{ + if (foreign->id == NULL) { + /* Generate a new constraint id */ + ulint namelen = strlen(name); + char* id = static_cast<char*>( + mem_heap_alloc(foreign->heap, + namelen + 20)); + + if (row_is_mysql_tmp_table_name(name)) { + + /* no overflow if number < 1e13 */ + sprintf(id, "%s_ibfk_%lu", name, + (ulong) (*id_nr)++); + } else { + char table_name[MAX_TABLE_NAME_LEN + 20] = ""; + uint errors = 0; + + strncpy(table_name, name, + MAX_TABLE_NAME_LEN + 20); + + innobase_convert_to_system_charset( + strchr(table_name, '/') + 1, + strchr(name, '/') + 1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + strncpy(table_name, name, + MAX_TABLE_NAME_LEN + 20); + } + + /* no overflow if number < 1e13 */ + sprintf(id, "%s_ibfk_%lu", table_name, + (ulong) (*id_nr)++); + + if (innobase_check_identifier_length( + strchr(id,'/') + 1)) { + return(DB_IDENTIFIER_TOO_LONG); + } + } + foreign->id = id; + } + + return(DB_SUCCESS); +} + diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h new file mode 100644 index 00000000000..dd61e5becc1 --- /dev/null +++ b/storage/innobase/include/dict0dict.h @@ -0,0 +1,1841 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0dict.h +Data dictionary system + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0dict_h +#define dict0dict_h + +#include "univ.i" +#include "db0err.h" +#include "dict0types.h" +#include "dict0mem.h" +#include "data0type.h" +#include "data0data.h" +#include "mem0mem.h" +#include "rem0types.h" +#include "ut0mem.h" +#include "ut0lst.h" +#include "hash0hash.h" +#include "ut0rnd.h" +#include "ut0byte.h" +#include "trx0types.h" +#include "row0types.h" + +#ifndef UNIV_HOTBACKUP +# include "sync0sync.h" +# include "sync0rw.h" +/******************************************************************//** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +UNIV_INTERN +void +dict_casedn_str( +/*============*/ + char* a) /*!< in/out: string to put in lower case */ + __attribute__((nonnull)); +/********************************************************************//** +Get the database name length in a table name. +@return database name length */ +UNIV_INTERN +ulint +dict_get_db_name_len( +/*=================*/ + const char* name) /*!< in: table name in the form + dbname '/' tablename */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Open a table from its database and table name, this is currently used by +foreign constraint parser to get the referenced table. +@return complete table name with database and table name, allocated from +heap memory passed in */ +UNIV_INTERN +char* +dict_get_referenced_table( +/*======================*/ + const char* name, /*!< in: foreign key table name */ + const char* database_name, /*!< in: table db name */ + ulint database_name_len,/*!< in: db name length */ + const char* table_name, /*!< in: table name */ + ulint table_name_len, /*!< in: table name length */ + dict_table_t** table, /*!< out: table object or NULL */ + mem_heap_t* heap); /*!< in: heap memory */ +/*********************************************************************//** +Frees a foreign key struct. */ + +void +dict_foreign_free( +/*==============*/ + dict_foreign_t* foreign); /*!< in, own: foreign key struct */ +/*********************************************************************//** +Finds the highest [number] for foreign key constraints of the table. Looks +only at the >= 4.0.18-format id's, which are of the form +databasename/tablename_ibfk_[number]. +@return highest number, 0 if table has no new format foreign key constraints */ +UNIV_INTERN +ulint +dict_table_get_highest_foreign_id( +/*==============================*/ + dict_table_t* table); /*!< in: table in the dictionary + memory cache */ +/********************************************************************//** +Return the end of table name where we have removed dbname and '/'. +@return table name */ +UNIV_INTERN +const char* +dict_remove_db_name( +/*================*/ + const char* name) /*!< in: table name in the form + dbname '/' tablename */ + __attribute__((nonnull, warn_unused_result)); + +/** Operation to perform when opening a table */ +enum dict_table_op_t { + /** Expect the tablespace to exist. */ + DICT_TABLE_OP_NORMAL = 0, + /** Drop any orphan indexes after an aborted online index creation */ + DICT_TABLE_OP_DROP_ORPHAN, + /** Silently load the tablespace if it does not exist, + and do not load the definitions of incomplete indexes. */ + DICT_TABLE_OP_LOAD_TABLESPACE +}; + +/**********************************************************************//** +Returns a table object based on table id. +@return table, NULL if does not exist */ +UNIV_INTERN +dict_table_t* +dict_table_open_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + dict_table_op_t table_op) /*!< in: operation to perform */ + __attribute__((warn_unused_result)); +/********************************************************************//** +Decrements the count of open handles to a table. */ +UNIV_INTERN +void +dict_table_close( +/*=============*/ + dict_table_t* table, /*!< in/out: table */ + ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + ibool try_drop) /*!< in: TRUE=try to drop any orphan + indexes after an aborted online + index creation */ + __attribute__((nonnull)); +/**********************************************************************//** +Inits the data dictionary module. */ +UNIV_INTERN +void +dict_init(void); +/*===========*/ +/********************************************************************//** +Gets the space id of every table of the data dictionary and makes a linear +list and a hash table of them to the data dictionary cache. This function +can be called at database startup if we did not need to do a crash recovery. +In crash recovery we must scan the space id's from the .ibd files in MySQL +database directories. */ +UNIV_INTERN +void +dict_load_space_id_list(void); +/*=========================*/ +/*********************************************************************//** +Gets the minimum number of bytes per character. +@return minimum multi-byte char size, in bytes */ +UNIV_INLINE +ulint +dict_col_get_mbminlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the maximum number of bytes per character. +@return maximum multi-byte char size, in bytes */ +UNIV_INLINE +ulint +dict_col_get_mbmaxlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets the minimum and maximum number of bytes per character. */ +UNIV_INLINE +void +dict_col_set_mbminmaxlen( +/*=====================*/ + dict_col_t* col, /*!< in/out: column */ + ulint mbminlen, /*!< in: minimum multi-byte + character size, in bytes */ + ulint mbmaxlen) /*!< in: minimum multi-byte + character size, in bytes */ + __attribute__((nonnull)); +/*********************************************************************//** +Gets the column data type. */ +UNIV_INLINE +void +dict_col_copy_type( +/*===============*/ + const dict_col_t* col, /*!< in: column */ + dtype_t* type) /*!< out: data type */ + __attribute__((nonnull)); +/**********************************************************************//** +Determine bytes of column prefix to be stored in the undo log. Please +note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix +needs to be stored in the undo log. +@return bytes of column prefix to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_field_len_store_undo( +/*==========================*/ + dict_table_t* table, /*!< in: table */ + const dict_col_t* col) /*!< in: column which index prefix + is based on */ + __attribute__((nonnull, warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_DEBUG +/*********************************************************************//** +Assert that a column and a data type match. +@return TRUE */ +UNIV_INLINE +ibool +dict_col_type_assert_equal( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + const dtype_t* type) /*!< in: data type */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Returns the minimum size of the column. +@return minimum size */ +UNIV_INLINE +ulint +dict_col_get_min_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the maximum size of the column. +@return maximum size */ +UNIV_INLINE +ulint +dict_col_get_max_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the size of a fixed size column, 0 if not a fixed size column. +@return fixed size, or 0 */ +UNIV_INLINE +ulint +dict_col_get_fixed_size( +/*====================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dict_col_get_sql_null_size( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column number. +@return col->ind, table column position (starting from 0) */ +UNIV_INLINE +ulint +dict_col_get_no( +/*============*/ + const dict_col_t* col) /*!< in: column */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column position in the clustered index. */ +UNIV_INLINE +ulint +dict_col_get_clust_pos( +/*===================*/ + const dict_col_t* col, /*!< in: table column */ + const dict_index_t* clust_index) /*!< in: clustered index */ + __attribute__((nonnull, warn_unused_result)); +/****************************************************************//** +If the given column name is reserved for InnoDB system columns, return +TRUE. +@return TRUE if name is reserved */ +UNIV_INTERN +ibool +dict_col_name_is_reserved( +/*======================*/ + const char* name) /*!< in: column name */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Acquire the autoinc lock. */ +UNIV_INTERN +void +dict_table_autoinc_lock( +/*====================*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); +/********************************************************************//** +Unconditionally set the autoinc counter. */ +UNIV_INTERN +void +dict_table_autoinc_initialize( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + ib_uint64_t value) /*!< in: next value to assign to a row */ + __attribute__((nonnull)); +/********************************************************************//** +Reads the next autoinc value (== autoinc counter value), 0 if not yet +initialized. +@return value for a new row, or 0 */ +UNIV_INTERN +ib_uint64_t +dict_table_autoinc_read( +/*====================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Updates the autoinc counter if the value supplied is greater than the +current value. */ +UNIV_INTERN +void +dict_table_autoinc_update_if_greater( +/*=================================*/ + + dict_table_t* table, /*!< in/out: table */ + ib_uint64_t value) /*!< in: value which was assigned to a row */ + __attribute__((nonnull)); +/********************************************************************//** +Release the autoinc lock. */ +UNIV_INTERN +void +dict_table_autoinc_unlock( +/*======================*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/**********************************************************************//** +Adds system columns to a table object. */ +UNIV_INTERN +void +dict_table_add_system_columns( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + mem_heap_t* heap) /*!< in: temporary heap */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Adds a table object to the dictionary cache. */ +UNIV_INTERN +void +dict_table_add_to_cache( +/*====================*/ + dict_table_t* table, /*!< in: table */ + ibool can_be_evicted, /*!< in: TRUE if can be evicted*/ + mem_heap_t* heap) /*!< in: temporary heap */ + __attribute__((nonnull)); +/**********************************************************************//** +Removes a table object from the dictionary cache. */ +UNIV_INTERN +void +dict_table_remove_from_cache( +/*=========================*/ + dict_table_t* table) /*!< in, own: table */ + __attribute__((nonnull)); +/**********************************************************************//** +Renames a table object. +@return TRUE if success */ +UNIV_INTERN +dberr_t +dict_table_rename_in_cache( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char* new_name, /*!< in: new name */ + ibool rename_also_foreigns) + /*!< in: in ALTER TABLE we want + to preserve the original table name + in constraints which reference it */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Removes an index from the dictionary cache. */ +UNIV_INTERN +void +dict_index_remove_from_cache( +/*=========================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index) /*!< in, own: index */ + __attribute__((nonnull)); +/**********************************************************************//** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. */ +UNIV_INTERN +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /*!< in/out: table object already in cache */ + table_id_t new_id) /*!< in: new id to set */ + __attribute__((nonnull)); +/**********************************************************************//** +Removes a foreign constraint struct from the dictionary cache. */ +UNIV_INTERN +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign) /*!< in, own: foreign constraint */ + __attribute__((nonnull)); +/**********************************************************************//** +Adds a foreign key constraint object to the dictionary cache. May free +the object if there already is an object with the same identifier in. +At least one of foreign table or referenced table must already be in +the dictionary cache! +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_foreign_add_to_cache( +/*======================*/ + dict_foreign_t* foreign, + /*!< in, own: foreign key constraint */ + const char** col_names, + /*!< in: column names, or NULL to use + foreign->foreign_table->col_names */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored */ + __attribute__((nonnull(1), warn_unused_result)); +/*********************************************************************//** +Check if the index is referenced by a foreign key, if TRUE return the +matching instance NULL otherwise. +@return pointer to foreign key struct if index is defined for foreign +key, otherwise NULL */ +UNIV_INTERN +dict_foreign_t* +dict_table_get_referenced_constraint( +/*=================================*/ + dict_table_t* table, /*!< in: InnoDB table */ + dict_index_t* index) /*!< in: InnoDB index */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Checks if a table is referenced by foreign keys. +@return TRUE if table is referenced by a foreign key */ +UNIV_INTERN +ibool +dict_table_is_referenced_by_foreign_key( +/*====================================*/ + const dict_table_t* table) /*!< in: InnoDB table */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Replace the index passed in with another equivalent index in the +foreign key lists of the table. +@return whether all replacements were found */ +UNIV_INTERN +bool +dict_foreign_replace_index( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const dict_index_t* index) /*!< in: index to be replaced */ + __attribute__((nonnull(1,3), warn_unused_result)); +/**********************************************************************//** +Determines whether a string starts with the specified keyword. +@return TRUE if str starts with keyword */ +UNIV_INTERN +ibool +dict_str_starts_with_keyword( +/*=========================*/ + THD* thd, /*!< in: MySQL thread handle */ + const char* str, /*!< in: string to scan for keyword */ + const char* keyword) /*!< in: keyword to look for */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Checks if a index is defined for a foreign key constraint. Index is a part +of a foreign key constraint if the index is referenced by foreign key +or index is a foreign key index +@return pointer to foreign key struct if index is defined for foreign +key, otherwise NULL */ +UNIV_INTERN +dict_foreign_t* +dict_table_get_foreign_constraint( +/*==============================*/ + dict_table_t* table, /*!< in: InnoDB table */ + dict_index_t* index) /*!< in: InnoDB index */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Scans a table create SQL string and adds to the data dictionary +the foreign key constraints declared in the string. This function +should be called after the indexes for a table have been created. +Each foreign key constraint must be accompanied with indexes in +bot participating tables. The indexes are allowed to contain more +fields than mentioned in the constraint. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_create_foreign_constraints( +/*============================*/ + trx_t* trx, /*!< in: transaction */ + const char* sql_string, /*!< in: table create statement where + foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES + table2(c, d), table2 can be written + also with the database + name before it: test.table2; the + default database id the database of + parameter name */ + size_t sql_length, /*!< in: length of sql_string */ + const char* name, /*!< in: table full name in the + normalized form + database_name/table_name */ + ibool reject_fks) /*!< in: if TRUE, fail with error + code DB_CANNOT_ADD_CONSTRAINT if + any foreign keys are found. */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. +@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the +constraint id does not match */ +UNIV_INTERN +dberr_t +dict_foreign_parse_drop_constraints( +/*================================*/ + mem_heap_t* heap, /*!< in: heap from which we can + allocate memory */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table, /*!< in: table */ + ulint* n, /*!< out: number of constraints + to drop */ + const char*** constraints_to_drop) /*!< out: id's of the + constraints to drop */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Returns a table object and increments its open handle count. +NOTE! This is a high-level function to be used mainly from outside the +'dict' directory. Inside this directory dict_table_get_low +is usually the appropriate function. +@return table, NULL if does not exist */ +UNIV_INTERN +dict_table_t* +dict_table_open_on_name( +/*====================*/ + const char* table_name, /*!< in: table name */ + ibool dict_locked, /*!< in: TRUE=data dictionary locked */ + ibool try_drop, /*!< in: TRUE=try to drop any orphan + indexes after an aborted online + index creation */ + dict_err_ignore_t + ignore_err) /*!< in: error to be ignored when + loading the table */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return matching index, NULL if not found */ +UNIV_INTERN +dict_index_t* +dict_foreign_find_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null) + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + __attribute__((nonnull(1,3), warn_unused_result)); +/**********************************************************************//** +Returns a column's name. +@return column name. NOTE: not guaranteed to stay valid if table is +modified in any way (columns added, etc.). */ +UNIV_INTERN +const char* +dict_table_get_col_name( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + ulint col_nr) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Prints a table data. */ +UNIV_INTERN +void +dict_table_print( +/*=============*/ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); +/**********************************************************************//** +Outputs info on foreign keys of a table. */ +UNIV_INTERN +void +dict_print_info_on_foreign_keys( +/*============================*/ + ibool create_table_format, /*!< in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ + FILE* file, /*!< in: file where to print */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); +/**********************************************************************//** +Outputs info on a foreign key of a table in a format suitable for +CREATE TABLE. */ +UNIV_INTERN +void +dict_print_info_on_foreign_key_in_create_format( +/*============================================*/ + FILE* file, /*!< in: file where to print */ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + ibool add_newline) /*!< in: whether to add a newline */ + __attribute__((nonnull(1,3))); +/********************************************************************//** +Displays the names of the index and the table. */ +UNIV_INTERN +void +dict_index_name_print( +/*==================*/ + FILE* file, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + const dict_index_t* index) /*!< in: index to print */ + __attribute__((nonnull(1,3))); +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return matching index, NULL if not found */ +UNIV_INTERN +bool +dict_foreign_qualify_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* index, /*!< in: index to check */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null) + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + __attribute__((nonnull(1,3), warn_unused_result)); +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the first index on the table (the clustered index). +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_first_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the last index on the table. +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_last_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the next index on the table. +@return index, NULL if none left */ +UNIV_INLINE +dict_index_t* +dict_table_get_next_index( +/*======================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes) +# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes) +# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index) +#endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + +/* Skip corrupted index */ +#define dict_table_skip_corrupt_index(index) \ + while (index && dict_index_is_corrupted(index)) { \ + index = dict_table_get_next_index(index); \ + } + +/* Get the next non-corrupt index */ +#define dict_table_next_uncorrupted_index(index) \ +do { \ + index = dict_table_get_next_index(index); \ + dict_table_skip_corrupt_index(index); \ +} while (0) + +/********************************************************************//** +Check whether the index is the clustered index. +@return nonzero for clustered index, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_clust( +/*================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, pure, warn_unused_result)); +/********************************************************************//** +Check whether the index is unique. +@return nonzero for unique index, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_unique( +/*=================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, pure, warn_unused_result)); +/********************************************************************//** +Check whether the index is the insert buffer tree. +@return nonzero for insert buffer, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_ibuf( +/*===============*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, pure, warn_unused_result)); +/********************************************************************//** +Check whether the index is a secondary index or the insert buffer tree. +@return nonzero for insert buffer, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_sec_or_ibuf( +/*======================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, pure, warn_unused_result)); + +/************************************************************************ +Gets the all the FTS indexes for the table. NOTE: must not be called for +tables which do not have an FTS-index. */ +UNIV_INTERN +ulint +dict_table_get_all_fts_indexes( +/*===========================*/ + /* out: number of indexes collected */ + dict_table_t* table, /* in: table */ + ib_vector_t* indexes)/* out: vector for collecting FTS indexes */ + __attribute__((nonnull)); +/********************************************************************//** +Gets the number of user-defined columns in a table in the dictionary +cache. +@return number of user-defined (e.g., not ROW_ID) columns of a table */ +UNIV_INLINE +ulint +dict_table_get_n_user_cols( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, pure, warn_unused_result)); +/********************************************************************//** +Gets the number of system columns in a table in the dictionary cache. +@return number of system (e.g., ROW_ID) columns of a table */ +UNIV_INLINE +ulint +dict_table_get_n_sys_cols( +/*======================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, pure, warn_unused_result)); +/********************************************************************//** +Gets the number of all columns (also system) in a table in the dictionary +cache. +@return number of columns of a table */ +UNIV_INLINE +ulint +dict_table_get_n_cols( +/*==================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, pure, warn_unused_result)); +/********************************************************************//** +Gets the approximately estimated number of rows in the table. +@return estimated number of rows */ +UNIV_INLINE +ib_uint64_t +dict_table_get_n_rows( +/*==================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Increment the number of rows in the table by one. +Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_inc( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); +/********************************************************************//** +Decrement the number of rows in the table by one. +Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_dec( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_nth_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + ulint pos) /*!< in: position of column */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the given system column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_sys_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + ulint sys) /*!< in: DATA_ROW_ID, ... */ + __attribute__((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +#define dict_table_get_nth_col(table, pos) \ +((table)->cols + (pos)) +#define dict_table_get_sys_col(table, sys) \ +((table)->cols + (table)->n_cols + (sys) - DATA_N_SYS_COLS) +#endif /* UNIV_DEBUG */ +/********************************************************************//** +Gets the given system column number of a table. +@return column number */ +UNIV_INLINE +ulint +dict_table_get_sys_col_no( +/*======================*/ + const dict_table_t* table, /*!< in: table */ + ulint sys) /*!< in: DATA_ROW_ID, ... */ + __attribute__((nonnull, warn_unused_result)); +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Returns the minimum data size of an index record. +@return minimum data size in bytes */ +UNIV_INLINE +ulint +dict_index_get_min_size( +/*====================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/********************************************************************//** +Check whether the table uses the compact page format. +@return TRUE if table uses the compact page format */ +UNIV_INLINE +ibool +dict_table_is_comp( +/*===============*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Determine the file format of a table. +@return file format version */ +UNIV_INLINE +ulint +dict_table_get_format( +/*==================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Determine the file format from a dict_table_t::flags. +@return file format version */ +UNIV_INLINE +ulint +dict_tf_get_format( +/*===============*/ + ulint flags) /*!< in: dict_table_t::flags */ + __attribute__((warn_unused_result)); +/********************************************************************//** +Set the various values in a dict_table_t::flags pointer. */ +UNIV_INLINE +void +dict_tf_set( +/*========*/ + ulint* flags, /*!< in/out: table */ + rec_format_t format, /*!< in: file format */ + ulint zip_ssize, /*!< in: zip shift size */ + bool remote_path) /*!< in: table uses DATA DIRECTORY */ + __attribute__((nonnull)); +/********************************************************************//** +Convert a 32 bit integer table flags to the 32 bit integer that is +written into the tablespace header at the offset FSP_SPACE_FLAGS and is +also stored in the fil_space_t::flags field. The following chart shows +the translation of the low order bit. Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC +dict_table_t::flags | 0 | 1 | 1 | 1 +fil_space_t::flags | 0 | 0 | 1 | 1 +================================================================== +@return tablespace flags (fil_space_t::flags) */ +UNIV_INLINE +ulint +dict_tf_to_fsp_flags( +/*=================*/ + ulint flags) /*!< in: dict_table_t::flags */ + __attribute__((const)); +/********************************************************************//** +Extract the compressed page size from table flags. +@return compressed page size, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_zip_size( +/*=================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Check whether the table uses the compressed compact page format. +@return compressed page size, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_zip_size( +/*================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Obtain exclusive locks on all index trees of the table. This is to prevent +accessing index trees while InnoDB is updating internal metadata for +operations such as truncate tables. */ +UNIV_INLINE +void +dict_table_x_lock_indexes( +/*======================*/ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); +/*********************************************************************//** +Release the exclusive locks on all index tree. */ +UNIV_INLINE +void +dict_table_x_unlock_indexes( +/*========================*/ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); +/********************************************************************//** +Checks if a column is in the ordering columns of the clustered index of a +table. Column prefixes are treated like whole columns. +@return TRUE if the column, or its prefix, is in the clustered key */ +UNIV_INTERN +ibool +dict_table_col_in_clustered_key( +/*============================*/ + const dict_table_t* table, /*!< in: table */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Check if the table has an FTS index. +@return TRUE if table has an FTS index */ +UNIV_INLINE +ibool +dict_table_has_fts_index( +/*=====================*/ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Copies types of columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). */ +UNIV_INTERN +void +dict_table_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); +/******************************************************************** +Wait until all the background threads of the given table have exited, i.e., +bg_threads == 0. Note: bg_threads_mutex must be reserved when +calling this. */ +UNIV_INTERN +void +dict_table_wait_for_bg_threads_to_exit( +/*===================================*/ + dict_table_t* table, /* in: table */ + ulint delay) /* in: time in microseconds to wait between + checks of bg_threads. */ + __attribute__((nonnull)); +/**********************************************************************//** +Looks for an index with the given id. NOTE that we do not reserve +the dictionary mutex: this function is for emergency purposes like +printing info of a corrupt database page! +@return index or NULL if not found from cache */ +UNIV_INTERN +dict_index_t* +dict_index_find_on_id_low( +/*======================*/ + index_id_t id) /*!< in: index id */ + __attribute__((warn_unused_result)); +/**********************************************************************//** +Make room in the table cache by evicting an unused table. The unused table +should not be part of FK relationship and currently not used in any user +transaction. There is no guarantee that it will remove a table. +@return number of tables evicted. */ +UNIV_INTERN +ulint +dict_make_room_in_cache( +/*====================*/ + ulint max_tables, /*!< in: max tables allowed in cache */ + ulint pct_check); /*!< in: max percent to check */ +/**********************************************************************//** +Adds an index to the dictionary cache. +@return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */ +UNIV_INTERN +dberr_t +dict_index_add_to_cache( +/*====================*/ + dict_table_t* table, /*!< in: table on which the index is */ + dict_index_t* index, /*!< in, own: index; NOTE! The index memory + object is freed in this function! */ + ulint page_no,/*!< in: root page number of the index */ + ibool strict) /*!< in: TRUE=refuse to create the index + if records could be too big to fit in + an B-tree page */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Removes an index from the dictionary cache. */ +UNIV_INTERN +void +dict_index_remove_from_cache( +/*=========================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index) /*!< in, own: index */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/********************************************************************//** +Gets the number of fields in the internal representation of an index, +including fields added by the dictionary system. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_fields( +/*====================*/ + const dict_index_t* index) /*!< in: an internal + representation of index (in + the dictionary cache) */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the number of fields in the internal representation of an index +that uniquely determine the position of an index entry in the index, if +we do not take multiversioning into account: in the B-tree use the value +returned by dict_index_get_n_unique_in_tree. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_unique( +/*====================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the number of fields in the internal representation of an index +which uniquely determine the position of an index entry in the index, if +we also take multiversioning into account. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_unique_in_tree( +/*============================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the number of user-defined ordering fields in the index. In the internal +representation we add the row id to the ordering fields to make all indexes +unique, but this function returns the number of fields the user defined +in the index as ordering fields. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_ordering_defined_by_user( +/*======================================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + __attribute__((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth field of an index. +@return pointer to field object */ +UNIV_INLINE +dict_field_t* +dict_index_get_nth_field( +/*=====================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of field */ + __attribute__((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos)) +#endif /* UNIV_DEBUG */ +/********************************************************************//** +Gets pointer to the nth column in an index. +@return column */ +UNIV_INLINE +const dict_col_t* +dict_index_get_nth_col( +/*===================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the column number of the nth field in an index. +@return column number */ +UNIV_INLINE +ulint +dict_index_get_nth_col_no( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for column n in an index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INLINE +ulint +dict_index_get_nth_col_pos( +/*=======================*/ + const dict_index_t* index, /*!< in: index */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for column n in an index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INTERN +ulint +dict_index_get_nth_col_or_prefix_pos( +/*=================================*/ + const dict_index_t* index, /*!< in: index */ + ulint n, /*!< in: column number */ + ibool inc_prefix) /*!< in: TRUE=consider + column prefixes too */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Returns TRUE if the index contains a column or a prefix of that column. +@return TRUE if contains the column or its prefix */ +UNIV_INTERN +ibool +dict_index_contains_col_or_prefix( +/*==============================*/ + const dict_index_t* index, /*!< in: index */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for a matching field in an index. The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INTERN +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + const dict_index_t* index, /*!< in: index from which to search */ + const dict_index_t* index2, /*!< in: index */ + ulint n) /*!< in: field number in index2 */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for column n position in the clustered index. +@return position in internal representation of the clustered index */ +UNIV_INTERN +ulint +dict_table_get_nth_col_pos( +/*=======================*/ + const dict_table_t* table, /*!< in: table */ + ulint n) /*!< in: column number */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Returns the position of a system column in an index. +@return position, ULINT_UNDEFINED if not contained */ +UNIV_INLINE +ulint +dict_index_get_sys_col_pos( +/*=======================*/ + const dict_index_t* index, /*!< in: index */ + ulint type) /*!< in: DATA_ROW_ID, ... */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Adds a column to index. */ +UNIV_INTERN +void +dict_index_add_col( +/*===============*/ + dict_index_t* index, /*!< in/out: index */ + const dict_table_t* table, /*!< in: table */ + dict_col_t* col, /*!< in: column */ + ulint prefix_len) /*!< in: column prefix length */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Copies types of fields contained in index to tuple. */ +UNIV_INTERN +void +dict_index_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_index_t* index, /*!< in: index */ + ulint n_fields) /*!< in: number of + field types to copy */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Gets the field column. +@return field->col, pointer to the table column */ +UNIV_INLINE +const dict_col_t* +dict_field_get_col( +/*===============*/ + const dict_field_t* field) /*!< in: index field */ + __attribute__((nonnull, warn_unused_result)); +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +Assumes that dict_sys->mutex is already being held. +@return index, NULL if not found */ +UNIV_INTERN +dict_index_t* +dict_index_get_if_in_cache_low( +/*===========================*/ + index_id_t index_id) /*!< in: index id */ + __attribute__((warn_unused_result)); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +@return index, NULL if not found */ +UNIV_INTERN +dict_index_t* +dict_index_get_if_in_cache( +/*=======================*/ + index_id_t index_id) /*!< in: index id */ + __attribute__((warn_unused_result)); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#ifdef UNIV_DEBUG +/**********************************************************************//** +Checks that a tuple has n_fields_cmp value in a sensible range, so that +no comparison can occur with the page number field in a node pointer. +@return TRUE if ok */ +UNIV_INTERN +ibool +dict_index_check_search_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index tree */ + const dtuple_t* tuple) /*!< in: tuple used in a search */ + __attribute__((nonnull, warn_unused_result)); +/** Whether and when to allow temporary index names */ +enum check_name { + /** Require all indexes to be complete. */ + CHECK_ALL_COMPLETE, + /** Allow aborted online index creation. */ + CHECK_ABORTED_OK, + /** Allow partial indexes to exist. */ + CHECK_PARTIAL_OK +}; +/**********************************************************************//** +Check for duplicate index entries in a table [using the index name] */ +UNIV_INTERN +void +dict_table_check_for_dup_indexes( +/*=============================*/ + const dict_table_t* table, /*!< in: Check for dup indexes + in this table */ + enum check_name check) /*!< in: whether and when to allow + temporary index names */ + __attribute__((nonnull)); +#endif /* UNIV_DEBUG */ +/**********************************************************************//** +Builds a node pointer out of a physical record and a page number. +@return own: node pointer */ +UNIV_INTERN +dtuple_t* +dict_index_build_node_ptr( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record for which to build node + pointer */ + ulint page_no,/*!< in: page number to put in node + pointer */ + mem_heap_t* heap, /*!< in: memory heap where pointer + created */ + ulint level) /*!< in: level of rec in tree: + 0 means leaf level */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Copies an initial segment of a physical record, long enough to specify an +index entry uniquely. +@return pointer to the prefix record */ +UNIV_INTERN +rec_t* +dict_index_copy_rec_order_prefix( +/*=============================*/ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record for which to + copy prefix */ + ulint* n_fields,/*!< out: number of fields copied */ + byte** buf, /*!< in/out: memory buffer for the + copied prefix, or NULL */ + ulint* buf_size)/*!< in/out: buffer size */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Builds a typed data tuple out of a physical record. +@return own: data tuple */ +UNIV_INTERN +dtuple_t* +dict_index_build_data_tuple( +/*========================*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record for which to build data tuple */ + ulint n_fields,/*!< in: number of data fields */ + mem_heap_t* heap) /*!< in: memory heap where tuple created */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the space id of the root of the index tree. +@return space id */ +UNIV_INLINE +ulint +dict_index_get_space( +/*=================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets the space id of the root of the index tree. */ +UNIV_INLINE +void +dict_index_set_space( +/*=================*/ + dict_index_t* index, /*!< in/out: index */ + ulint space) /*!< in: space id */ + __attribute__((nonnull)); +/*********************************************************************//** +Gets the page number of the root of the index tree. +@return page number */ +UNIV_INLINE +ulint +dict_index_get_page( +/*================*/ + const dict_index_t* tree) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the read-write lock of the index tree. +@return read-write lock */ +UNIV_INLINE +rw_lock_t* +dict_index_get_lock( +/*================*/ + dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Returns free space reserved for future updates of records. This is +relevant only in the case of many consecutive inserts, as updates +which make the records bigger might fragment the index. +@return number of free bytes on page, reserved for updates */ +UNIV_INLINE +ulint +dict_index_get_space_reserve(void); +/*==============================*/ + +/* Online index creation @{ */ +/********************************************************************//** +Gets the status of online index creation. +@return the status */ +UNIV_INLINE +enum online_index_status +dict_index_get_online_status( +/*=========================*/ + const dict_index_t* index) /*!< in: secondary index */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Sets the status of online index creation. */ +UNIV_INLINE +void +dict_index_set_online_status( +/*=========================*/ + dict_index_t* index, /*!< in/out: index */ + enum online_index_status status) /*!< in: status */ + __attribute__((nonnull)); +/********************************************************************//** +Determines if a secondary index is being or has been created online, +or if the table is being rebuilt online, allowing concurrent modifications +to the table. +@retval true if the index is being or has been built online, or +if this is a clustered index and the table is being or has been rebuilt online +@retval false if the index has been created or the table has been +rebuilt completely */ +UNIV_INLINE +bool +dict_index_is_online_ddl( +/*=====================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Calculates the minimum record length in an index. */ +UNIV_INTERN +ulint +dict_index_calc_min_rec_len( +/*========================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Reserves the dictionary system mutex for MySQL. */ +UNIV_INTERN +void +dict_mutex_enter_for_mysql(void); +/*============================*/ +/********************************************************************//** +Releases the dictionary system mutex for MySQL. */ +UNIV_INTERN +void +dict_mutex_exit_for_mysql(void); +/*===========================*/ + +/** Create a dict_table_t's stats latch or delay for lazy creation. +This function is only called from either single threaded environment +or from a thread that has not shared the table object with other threads. +@param[in,out] table table whose stats latch to create +@param[in] enabled if false then the latch is disabled +and dict_table_stats_lock()/unlock() become noop on this table. */ + +void +dict_table_stats_latch_create( + dict_table_t* table, + bool enabled); + +/** Destroy a dict_table_t's stats latch. +This function is only called from either single threaded environment +or from a thread that has not shared the table object with other threads. +@param[in,out] table table whose stats latch to destroy */ + +void +dict_table_stats_latch_destroy( + dict_table_t* table); + +/**********************************************************************//** +Lock the appropriate latch to protect a given table's statistics. +table->id is used to pick the corresponding latch from a global array of +latches. */ +UNIV_INTERN +void +dict_table_stats_lock( +/*==================*/ + dict_table_t* table, /*!< in: table */ + ulint latch_mode); /*!< in: RW_S_LATCH or RW_X_LATCH */ +/**********************************************************************//** +Unlock the latch that has been locked by dict_table_stats_lock() */ +UNIV_INTERN +void +dict_table_stats_unlock( +/*====================*/ + dict_table_t* table, /*!< in: table */ + ulint latch_mode); /*!< in: RW_S_LATCH or RW_X_LATCH */ +/********************************************************************//** +Checks if the database name in two table names is the same. +@return TRUE if same db name */ +UNIV_INTERN +ibool +dict_tables_have_same_db( +/*=====================*/ + const char* name1, /*!< in: table name in the form + dbname '/' tablename */ + const char* name2) /*!< in: table name in the form + dbname '/' tablename */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Removes an index from the cache */ +UNIV_INTERN +void +dict_index_remove_from_cache( +/*=========================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index) /*!< in, own: index */ + __attribute__((nonnull)); +/**********************************************************************//** +Get index by name +@return index, NULL if does not exist */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_on_name( +/*=========================*/ + dict_table_t* table, /*!< in: table */ + const char* name) /*!< in: name of the index to find */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +In case there is more than one index with the same name return the index +with the min(id). +@return index, NULL if does not exist */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_on_name_and_min_id( +/*====================================*/ + dict_table_t* table, /*!< in: table */ + const char* name) /*!< in: name of the index to find */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************** +Check whether a column exists in an FTS index. */ +UNIV_INLINE +ulint +dict_table_is_fts_column( +/*=====================*/ + /* out: ULINT_UNDEFINED if no match else + the offset within the vector */ + ib_vector_t* indexes,/* in: vector containing only FTS indexes */ + ulint col_no) /* in: col number to search for */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Move a table to the non LRU end of the LRU list. */ +UNIV_INTERN +void +dict_table_move_from_lru_to_non_lru( +/*================================*/ + dict_table_t* table) /*!< in: table to move from LRU to non-LRU */ + __attribute__((nonnull)); +/**********************************************************************//** +Move a table to the LRU list from the non-LRU list. */ +UNIV_INTERN +void +dict_table_move_from_non_lru_to_lru( +/*================================*/ + dict_table_t* table) /*!< in: table to move from non-LRU to LRU */ + __attribute__((nonnull)); +/**********************************************************************//** +Move to the most recently used segment of the LRU list. */ +UNIV_INTERN +void +dict_move_to_mru( +/*=============*/ + dict_table_t* table) /*!< in: table to move to MRU */ + __attribute__((nonnull)); + +/** Maximum number of columns in a foreign key constraint. Please Note MySQL +has a much lower limit on the number of columns allowed in a foreign key +constraint */ +#define MAX_NUM_FK_COLUMNS 500 + +/* Buffers for storing detailed information about the latest foreign key +and unique key errors */ +extern FILE* dict_foreign_err_file; +extern ib_mutex_t dict_foreign_err_mutex; /* mutex protecting the buffers */ + +/** the dictionary system */ +extern dict_sys_t* dict_sys; +/** the data dictionary rw-latch protecting dict_sys */ +extern rw_lock_t dict_operation_lock; + +/* Dictionary system struct */ +struct dict_sys_t{ + ib_mutex_t mutex; /*!< mutex protecting the data + dictionary; protects also the + disk-based dictionary system tables; + this mutex serializes CREATE TABLE + and DROP TABLE, as well as reading + the dictionary data for a table from + system tables */ + row_id_t row_id; /*!< the next row id to assign; + NOTE that at a checkpoint this + must be written to the dict system + header and flushed to a file; in + recovery this must be derived from + the log records */ + hash_table_t* table_hash; /*!< hash table of the tables, based + on name */ + hash_table_t* table_id_hash; /*!< hash table of the tables, based + on id */ + ulint size; /*!< varying space in bytes occupied + by the data dictionary table and + index objects */ + dict_table_t* sys_tables; /*!< SYS_TABLES table */ + dict_table_t* sys_columns; /*!< SYS_COLUMNS table */ + dict_table_t* sys_indexes; /*!< SYS_INDEXES table */ + dict_table_t* sys_fields; /*!< SYS_FIELDS table */ + + /*=============================*/ + UT_LIST_BASE_NODE_T(dict_table_t) + table_LRU; /*!< List of tables that can be evicted + from the cache */ + UT_LIST_BASE_NODE_T(dict_table_t) + table_non_LRU; /*!< List of tables that can't be + evicted from the cache */ +}; +#endif /* !UNIV_HOTBACKUP */ + +/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */ +extern dict_index_t* dict_ind_redundant; +/** dummy index for ROW_FORMAT=COMPACT supremum and infimum records */ +extern dict_index_t* dict_ind_compact; + +/**********************************************************************//** +Inits dict_ind_redundant and dict_ind_compact. */ +UNIV_INTERN +void +dict_ind_init(void); +/*===============*/ + +/* Auxiliary structs for checking a table definition @{ */ + +/* This struct is used to specify the name and type that a column must +have when checking a table's schema. */ +struct dict_col_meta_t { + const char* name; /* column name */ + ulint mtype; /* required column main type */ + ulint prtype_mask; /* required column precise type mask; + if this is non-zero then all the + bits it has set must also be set + in the column's prtype */ + ulint len; /* required column length */ +}; + +/* This struct is used for checking whether a given table exists and +whether it has a predefined schema (number of columns and columns names +and types) */ +struct dict_table_schema_t { + const char* table_name; /* the name of the table whose + structure we are checking */ + ulint n_cols; /* the number of columns the + table must have */ + dict_col_meta_t* columns; /* metadata for the columns; + this array has n_cols + elements */ + ulint n_foreign; /* number of foreign keys this + table has, pointing to other + tables (where this table is + FK child) */ + ulint n_referenced; /* number of foreign keys other + tables have, pointing to this + table (where this table is + parent) */ +}; +/* @} */ + +/*********************************************************************//** +Checks whether a table exists and whether it has the given structure. +The table must have the same number of columns with the same names and +types. The order of the columns does not matter. +The caller must own the dictionary mutex. +dict_table_schema_check() @{ +@return DB_SUCCESS if the table exists and contains the necessary columns */ +UNIV_INTERN +dberr_t +dict_table_schema_check( +/*====================*/ + dict_table_schema_t* req_schema, /*!< in/out: required table + schema */ + char* errstr, /*!< out: human readable error + message if != DB_SUCCESS and + != DB_TABLE_NOT_FOUND is + returned */ + size_t errstr_sz) /*!< in: errstr size */ + __attribute__((nonnull, warn_unused_result)); +/* @} */ + +/*********************************************************************//** +Converts a database and table name from filesystem encoding +(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) in two +strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be +at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */ +UNIV_INTERN +void +dict_fs2utf8( +/*=========*/ + const char* db_and_table, /*!< in: database and table names, + e.g. d@i1b/a@q1b@1Kc */ + char* db_utf8, /*!< out: database name, e.g. dцb */ + size_t db_utf8_size, /*!< in: dbname_utf8 size */ + char* table_utf8, /*!< out: table name, e.g. aюbØc */ + size_t table_utf8_size)/*!< in: table_utf8 size */ + __attribute__((nonnull)); + +/**********************************************************************//** +Closes the data dictionary module. */ +UNIV_INTERN +void +dict_close(void); +/*============*/ +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Check whether the table is corrupted. +@return nonzero for corrupted table, zero for valid tables */ +UNIV_INLINE +ulint +dict_table_is_corrupted( +/*====================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); + +/**********************************************************************//** +Check whether the index is corrupted. +@return nonzero for corrupted index, zero for valid indexes */ +UNIV_INLINE +ulint +dict_index_is_corrupted( +/*====================*/ + const dict_index_t* index) /*!< in: index */ + __attribute__((nonnull, warn_unused_result)); + +#endif /* !UNIV_HOTBACKUP */ +/**********************************************************************//** +Flags an index and table corrupted both in the data dictionary cache +and in the system table SYS_INDEXES. */ +UNIV_INTERN +void +dict_set_corrupted( +/*===============*/ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx, /*!< in/out: transaction */ + const char* ctx) /*!< in: context */ + UNIV_COLD __attribute__((nonnull)); + +/**********************************************************************//** +Flags an index corrupted in the data dictionary cache only. This +is used mostly to mark a corrupted index when index's own dictionary +is corrupted, and we force to load such index for repair purpose */ +UNIV_INTERN +void +dict_set_corrupted_index_cache_only( +/*================================*/ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); + +/**********************************************************************//** +Flags a table with specified space_id corrupted in the table dictionary +cache. +@return TRUE if successful */ +UNIV_INTERN +ibool +dict_set_corrupted_by_space( +/*========================*/ + ulint space_id); /*!< in: space ID */ + +/********************************************************************//** +Validate the table flags. +@return true if valid. */ +UNIV_INLINE +bool +dict_tf_is_valid( +/*=============*/ + ulint flags) /*!< in: table flags */ + __attribute__((warn_unused_result)); + +/********************************************************************//** +Check if the tablespace for the table has been discarded. +@return true if the tablespace has been discarded. */ +UNIV_INLINE +bool +dict_table_is_discarded( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ + __attribute__((nonnull, pure, warn_unused_result)); + +/********************************************************************//** +Check if it is a temporary table. +@return true if temporary table flag is set. */ +UNIV_INLINE +bool +dict_table_is_temporary( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ + __attribute__((nonnull, pure, warn_unused_result)); + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +This function should be called whenever a page is successfully +compressed. Updates the compression padding information. */ +UNIV_INTERN +void +dict_index_zip_success( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ + __attribute__((nonnull)); +/*********************************************************************//** +This function should be called whenever a page compression attempt +fails. Updates the compression padding information. */ +UNIV_INTERN +void +dict_index_zip_failure( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ + __attribute__((nonnull)); +/*********************************************************************//** +Return the optimal page size, for which page will likely compress. +@return page size beyond which page may not compress*/ +UNIV_INTERN +ulint +dict_index_zip_pad_optimal_page_size( +/*=================================*/ + dict_index_t* index) /*!< in: index for which page size + is requested */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Convert table flag to row format string. +@return row format name */ +UNIV_INTERN +const char* +dict_tf_to_row_format_string( +/*=========================*/ + ulint table_flag); /*!< in: row format setting */ +/*****************************************************************//** +Get index by first field of the index +@return index which is having first field matches +with the field present in field_index position of table */ +UNIV_INLINE +dict_index_t* +dict_table_get_index_on_first_col( +/*==============================*/ + const dict_table_t* table, /*!< in: table */ + ulint col_index); /*!< in: position of column + in table */ + +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_NONINL +#include "dict0dict.ic" +#endif + +#endif diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic new file mode 100644 index 00000000000..066ffe47e4a --- /dev/null +++ b/storage/innobase/include/dict0dict.ic @@ -0,0 +1,1433 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0dict.ic +Data dictionary system + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include "data0type.h" +#ifndef UNIV_HOTBACKUP +#include "dict0load.h" +#include "rem0types.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "sync0rw.h" /* RW_S_LATCH */ + +/*********************************************************************//** +Gets the minimum number of bytes per character. +@return minimum multi-byte char size, in bytes */ +UNIV_INLINE +ulint +dict_col_get_mbminlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ +{ + return(DATA_MBMINLEN(col->mbminmaxlen)); +} +/*********************************************************************//** +Gets the maximum number of bytes per character. +@return maximum multi-byte char size, in bytes */ +UNIV_INLINE +ulint +dict_col_get_mbmaxlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ +{ + return(DATA_MBMAXLEN(col->mbminmaxlen)); +} +/*********************************************************************//** +Sets the minimum and maximum number of bytes per character. */ +UNIV_INLINE +void +dict_col_set_mbminmaxlen( +/*=====================*/ + dict_col_t* col, /*!< in/out: column */ + ulint mbminlen, /*!< in: minimum multi-byte + character size, in bytes */ + ulint mbmaxlen) /*!< in: minimum multi-byte + character size, in bytes */ +{ + ut_ad(mbminlen < DATA_MBMAX); + ut_ad(mbmaxlen < DATA_MBMAX); + ut_ad(mbminlen <= mbmaxlen); + + col->mbminmaxlen = DATA_MBMINMAXLEN(mbminlen, mbmaxlen); +} +/*********************************************************************//** +Gets the column data type. */ +UNIV_INLINE +void +dict_col_copy_type( +/*===============*/ + const dict_col_t* col, /*!< in: column */ + dtype_t* type) /*!< out: data type */ +{ + ut_ad(col && type); + + type->mtype = col->mtype; + type->prtype = col->prtype; + type->len = col->len; + type->mbminmaxlen = col->mbminmaxlen; +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Assert that a column and a data type match. +@return TRUE */ +UNIV_INLINE +ibool +dict_col_type_assert_equal( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + const dtype_t* type) /*!< in: data type */ +{ + ut_ad(col); + ut_ad(type); + + ut_ad(col->mtype == type->mtype); + ut_ad(col->prtype == type->prtype); + //ut_ad(col->len == type->len); +# ifndef UNIV_HOTBACKUP + ut_ad(col->mbminmaxlen == type->mbminmaxlen); +# endif /* !UNIV_HOTBACKUP */ + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Returns the minimum size of the column. +@return minimum size */ +UNIV_INLINE +ulint +dict_col_get_min_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ +{ + return(dtype_get_min_size_low(col->mtype, col->prtype, col->len, + col->mbminmaxlen)); +} +/***********************************************************************//** +Returns the maximum size of the column. +@return maximum size */ +UNIV_INLINE +ulint +dict_col_get_max_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ +{ + return(dtype_get_max_size_low(col->mtype, col->len)); +} +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************************//** +Returns the size of a fixed size column, 0 if not a fixed size column. +@return fixed size, or 0 */ +UNIV_INLINE +ulint +dict_col_get_fixed_size( +/*====================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len, + col->mbminmaxlen, comp)); +} +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dict_col_get_sql_null_size( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + return(dict_col_get_fixed_size(col, comp)); +} + +/*********************************************************************//** +Gets the column number. +@return col->ind, table column position (starting from 0) */ +UNIV_INLINE +ulint +dict_col_get_no( +/*============*/ + const dict_col_t* col) /*!< in: column */ +{ + ut_ad(col); + + return(col->ind); +} + +/*********************************************************************//** +Gets the column position in the clustered index. */ +UNIV_INLINE +ulint +dict_col_get_clust_pos( +/*===================*/ + const dict_col_t* col, /*!< in: table column */ + const dict_index_t* clust_index) /*!< in: clustered index */ +{ + ulint i; + + ut_ad(col); + ut_ad(clust_index); + ut_ad(dict_index_is_clust(clust_index)); + + for (i = 0; i < clust_index->n_def; i++) { + const dict_field_t* field = &clust_index->fields[i]; + + if (!field->prefix_len && field->col == col) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the first index on the table (the clustered index). +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_first_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes)); +} + +/********************************************************************//** +Gets the last index on the table. +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_last_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(UT_LIST_GET_LAST((const_cast<dict_table_t*>(table)) + ->indexes)); +} + +/********************************************************************//** +Gets the next index on the table. +@return index, NULL if none left */ +UNIV_INLINE +dict_index_t* +dict_table_get_next_index( +/*======================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index)); +} +#endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************************//** +Check whether the index is the clustered index. +@return nonzero for clustered index, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_clust( +/*================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->type & DICT_CLUSTERED); +} +/********************************************************************//** +Check whether the index is unique. +@return nonzero for unique index, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_unique( +/*=================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->type & DICT_UNIQUE); +} + +/********************************************************************//** +Check whether the index is the insert buffer tree. +@return nonzero for insert buffer, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_ibuf( +/*===============*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->type & DICT_IBUF); +} + +/********************************************************************//** +Check whether the index is an universal index tree. +@return nonzero for universal tree, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_univ( +/*===============*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->type & DICT_UNIVERSAL); +} + +/********************************************************************//** +Check whether the index is a secondary index or the insert buffer tree. +@return nonzero for insert buffer, zero for other indexes */ +UNIV_INLINE +ulint +dict_index_is_sec_or_ibuf( +/*======================*/ + const dict_index_t* index) /*!< in: index */ +{ + ulint type; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + type = index->type; + + return(!(type & DICT_CLUSTERED) || (type & DICT_IBUF)); +} + +/********************************************************************//** +Gets the number of user-defined columns in a table in the dictionary +cache. +@return number of user-defined (e.g., not ROW_ID) columns of a table */ +UNIV_INLINE +ulint +dict_table_get_n_user_cols( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(table->n_cols - DATA_N_SYS_COLS); +} + +/********************************************************************//** +Gets the number of system columns in a table in the dictionary cache. +@return number of system (e.g., ROW_ID) columns of a table */ +UNIV_INLINE +ulint +dict_table_get_n_sys_cols( +/*======================*/ + const dict_table_t* table __attribute__((unused))) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(table->cached); + + return(DATA_N_SYS_COLS); +} + +/********************************************************************//** +Gets the number of all columns (also system) in a table in the dictionary +cache. +@return number of columns of a table */ +UNIV_INLINE +ulint +dict_table_get_n_cols( +/*==================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(table->n_cols); +} + +/********************************************************************//** +Gets the approximately estimated number of rows in the table. +@return estimated number of rows */ +UNIV_INLINE +ib_uint64_t +dict_table_get_n_rows( +/*==================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table->stat_initialized); + + return(table->stat_n_rows); +} + +/********************************************************************//** +Increment the number of rows in the table by one. +Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_inc( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ +{ + if (table->stat_initialized) { + ib_uint64_t n_rows = table->stat_n_rows; + if (n_rows < 0xFFFFFFFFFFFFFFFFULL) { + table->stat_n_rows = n_rows + 1; + } + } +} + +/********************************************************************//** +Decrement the number of rows in the table by one. +Notice that this operation is not protected by any latch, the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_dec( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ +{ + if (table->stat_initialized) { + ib_uint64_t n_rows = table->stat_n_rows; + if (n_rows > 0) { + table->stat_n_rows = n_rows - 1; + } + } +} + +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_nth_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + ulint pos) /*!< in: position of column */ +{ + ut_ad(table); + ut_ad(pos < table->n_def); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return((dict_col_t*) (table->cols) + pos); +} + +/********************************************************************//** +Gets the given system column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_sys_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + ulint sys) /*!< in: DATA_ROW_ID, ... */ +{ + dict_col_t* col; + + ut_ad(table); + ut_ad(sys < DATA_N_SYS_COLS); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + col = dict_table_get_nth_col(table, table->n_cols + - DATA_N_SYS_COLS + sys); + ut_ad(col->mtype == DATA_SYS); + ut_ad(col->prtype == (sys | DATA_NOT_NULL)); + + return(col); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Gets the given system column number of a table. +@return column number */ +UNIV_INLINE +ulint +dict_table_get_sys_col_no( +/*======================*/ + const dict_table_t* table, /*!< in: table */ + ulint sys) /*!< in: DATA_ROW_ID, ... */ +{ + ut_ad(table); + ut_ad(sys < DATA_N_SYS_COLS); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(table->n_cols - DATA_N_SYS_COLS + sys); +} + +/********************************************************************//** +Check whether the table uses the compact page format. +@return TRUE if table uses the compact page format */ +UNIV_INLINE +ibool +dict_table_is_comp( +/*===============*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + +#if DICT_TF_COMPACT != 1 +#error "DICT_TF_COMPACT must be 1" +#endif + + return(table->flags & DICT_TF_COMPACT); +} + +/************************************************************************ +Check if the table has an FTS index. */ +UNIV_INLINE +ibool +dict_table_has_fts_index( +/*=====================*/ + /* out: TRUE if table has an FTS index */ + dict_table_t* table) /* in: table */ +{ + ut_ad(table); + + return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)); +} + +/********************************************************************//** +Validate the table flags. +@return true if valid. */ +UNIV_INLINE +bool +dict_tf_is_valid( +/*=============*/ + ulint flags) /*!< in: table flags */ +{ + ulint compact = DICT_TF_GET_COMPACT(flags); + ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags); + ulint unused = DICT_TF_GET_UNUSED(flags); + + /* Make sure there are no bits that we do not know about. */ + if (unused != 0) { + + return(false); + + } else if (atomic_blobs) { + /* Barracuda row formats COMPRESSED and DYNAMIC build on + the page structure introduced for the COMPACT row format + by allowing keys in secondary indexes to be made from + data stored off-page in the clustered index. */ + + if (!compact) { + return(false); + } + + } else if (zip_ssize) { + + /* Antelope does not support COMPRESSED row format. */ + return(false); + } + + if (zip_ssize) { + + /* COMPRESSED row format must have compact and atomic_blobs + bits set and validate the number is within allowed range. */ + + if (!compact + || !atomic_blobs + || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + + return(false); + } + } + + /* CREATE TABLE ... DATA DIRECTORY is supported for any row format, + so the DATA_DIR flag is compatible with all other table flags. */ + + return(true); +} + +/********************************************************************//** +Validate a SYS_TABLES TYPE field and return it. +@return Same as input after validating it as a SYS_TABLES TYPE field. +If there is an error, return ULINT_UNDEFINED. */ +UNIV_INLINE +ulint +dict_sys_tables_type_validate( +/*==========================*/ + ulint type, /*!< in: SYS_TABLES.TYPE */ + ulint n_cols) /*!< in: SYS_TABLES.N_COLS */ +{ + ulint low_order_bit = DICT_TF_GET_COMPACT(type); + ulint redundant = !(n_cols & DICT_N_COLS_COMPACT); + ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type); + ulint unused = DICT_TF_GET_UNUSED(type); + + /* The low order bit of SYS_TABLES.TYPE is always set to 1. + If the format is UNIV_FORMAT_B or higher, this field is the same + as dict_table_t::flags. Zero is not allowed here. */ + if (!low_order_bit) { + return(ULINT_UNDEFINED); + } + + if (redundant) { + if (zip_ssize || atomic_blobs) { + return(ULINT_UNDEFINED); + } + } + + /* Make sure there are no bits that we do not know about. */ + if (unused) { + return(ULINT_UNDEFINED); + } + + if (atomic_blobs) { + /* Barracuda row formats COMPRESSED and DYNAMIC build on + the page structure introduced for the COMPACT row format + by allowing keys in secondary indexes to be made from + data stored off-page in the clustered index. + + The DICT_N_COLS_COMPACT flag should be in N_COLS, + but we already know that. */ + + } else if (zip_ssize) { + /* Antelope does not support COMPRESSED format. */ + return(ULINT_UNDEFINED); + } + + if (zip_ssize) { + /* COMPRESSED row format must have low_order_bit and + atomic_blobs bits set and the DICT_N_COLS_COMPACT flag + should be in N_COLS, but we already know about the + low_order_bit and DICT_N_COLS_COMPACT flags. */ + if (!atomic_blobs) { + return(ULINT_UNDEFINED); + } + + /* Validate that the number is within allowed range. */ + if (zip_ssize > PAGE_ZIP_SSIZE_MAX) { + return(ULINT_UNDEFINED); + } + } + + /* There is nothing to validate for the data_dir field. + CREATE TABLE ... DATA DIRECTORY is supported for any row + format, so the DATA_DIR flag is compatible with any other + table flags. However, it is not used with TEMPORARY tables.*/ + + /* Return the validated SYS_TABLES.TYPE. */ + return(type); +} + +/********************************************************************//** +Determine the file format from dict_table_t::flags +The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any +other row_format, file_format is > 0 and DICT_TF_COMPACT will also be set. +@return file format version */ +UNIV_INLINE +rec_format_t +dict_tf_get_rec_format( +/*===================*/ + ulint flags) /*!< in: dict_table_t::flags */ +{ + ut_a(dict_tf_is_valid(flags)); + + if (!DICT_TF_GET_COMPACT(flags)) { + return(REC_FORMAT_REDUNDANT); + } + + if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) { + return(REC_FORMAT_COMPACT); + } + + if (DICT_TF_GET_ZIP_SSIZE(flags)) { + return(REC_FORMAT_COMPRESSED); + } + + return(REC_FORMAT_DYNAMIC); +} + +/********************************************************************//** +Determine the file format from a dict_table_t::flags. +@return file format version */ +UNIV_INLINE +ulint +dict_tf_get_format( +/*===============*/ + ulint flags) /*!< in: dict_table_t::flags */ +{ + if (DICT_TF_HAS_ATOMIC_BLOBS(flags)) { + return(UNIV_FORMAT_B); + } + + return(UNIV_FORMAT_A); +} + +/********************************************************************//** +Determine the file format of a table. +@return file format version */ +UNIV_INLINE +ulint +dict_table_get_format( +/*==================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + + return(dict_tf_get_format(table->flags)); +} + +/********************************************************************//** +Set the file format and zip size in a dict_table_t::flags. If zip size +is not needed, it should be 0. */ +UNIV_INLINE +void +dict_tf_set( +/*========*/ + ulint* flags, /*!< in/out: table flags */ + rec_format_t format, /*!< in: file format */ + ulint zip_ssize, /*!< in: zip shift size */ + bool use_data_dir) /*!< in: table uses DATA DIRECTORY */ +{ + switch (format) { + case REC_FORMAT_REDUNDANT: + *flags = 0; + ut_ad(zip_ssize == 0); + break; + case REC_FORMAT_COMPACT: + *flags = DICT_TF_COMPACT; + ut_ad(zip_ssize == 0); + break; + case REC_FORMAT_COMPRESSED: + *flags = DICT_TF_COMPACT + | (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (zip_ssize << DICT_TF_POS_ZIP_SSIZE); + break; + case REC_FORMAT_DYNAMIC: + *flags = DICT_TF_COMPACT + | (1 << DICT_TF_POS_ATOMIC_BLOBS); + ut_ad(zip_ssize == 0); + break; + } + + if (use_data_dir) { + *flags |= (1 << DICT_TF_POS_DATA_DIR); + } +} + +/********************************************************************//** +Convert a 32 bit integer table flags to the 32 bit integer that is +written into the tablespace header at the offset FSP_SPACE_FLAGS and is +also stored in the fil_space_t::flags field. The following chart shows +the translation of the low order bit. Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC +dict_table_t::flags | 0 | 1 | 1 | 1 +fil_space_t::flags | 0 | 0 | 1 | 1 +================================================================== +@return tablespace flags (fil_space_t::flags) */ +UNIV_INLINE +ulint +dict_tf_to_fsp_flags( +/*=================*/ + ulint table_flags) /*!< in: dict_table_t::flags */ +{ + ulint fsp_flags; + + DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", + return(ULINT_UNDEFINED);); + + /* Adjust bit zero. */ + fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0; + + /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */ + fsp_flags |= table_flags & DICT_TF_MASK_ZIP_SSIZE; + fsp_flags |= table_flags & DICT_TF_MASK_ATOMIC_BLOBS; + + /* In addition, tablespace flags also contain the page size. */ + fsp_flags |= fsp_flags_set_page_size(fsp_flags, UNIV_PAGE_SIZE); + + /* The DATA_DIR flag is in a different position in fsp_flag */ + fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags) + ? FSP_FLAGS_MASK_DATA_DIR : 0; + + ut_a(fsp_flags_is_valid(fsp_flags)); + + return(fsp_flags); +} + +/********************************************************************//** +Convert a 32 bit integer from SYS_TABLES.TYPE to dict_table_t::flags +The following chart shows the translation of the low order bit. +Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC +SYS_TABLES.TYPE | 1 | 1 | 1 +dict_table_t::flags | 0 | 1 | 1 +================================================================== +@return ulint containing SYS_TABLES.TYPE */ +UNIV_INLINE +ulint +dict_sys_tables_type_to_tf( +/*=======================*/ + ulint type, /*!< in: SYS_TABLES.TYPE field */ + ulint n_cols) /*!< in: SYS_TABLES.N_COLS field */ +{ + ulint flags; + ulint redundant = !(n_cols & DICT_N_COLS_COMPACT); + + /* Adjust bit zero. */ + flags = redundant ? 0 : 1; + + /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + flags |= type & (DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS + | DICT_TF_MASK_DATA_DIR); + + return(flags); +} + +/********************************************************************//** +Convert a 32 bit integer table flags to the 32bit integer that is written +to a SYS_TABLES.TYPE field. The following chart shows the translation of +the low order bit. Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC +dict_table_t::flags | 0 | 1 | 1 +SYS_TABLES.TYPE | 1 | 1 | 1 +================================================================== +@return ulint containing SYS_TABLES.TYPE */ +UNIV_INLINE +ulint +dict_tf_to_sys_tables_type( +/*=======================*/ + ulint flags) /*!< in: dict_table_t::flags */ +{ + ulint type; + + ut_a(dict_tf_is_valid(flags)); + + /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ + type = 1; + + /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + type |= flags & (DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS + | DICT_TF_MASK_DATA_DIR); + + return(type); +} + +/********************************************************************//** +Extract the compressed page size from dict_table_t::flags. +These flags are in memory, so assert that they are valid. +@return compressed page size, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_zip_size( +/*=================*/ + ulint flags) /*!< in: flags */ +{ + ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); + ulint zip_size = (zip_ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize + : 0); + + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + + return(zip_size); +} + +/********************************************************************//** +Check whether the table uses the compressed compact page format. +@return compressed page size, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_zip_size( +/*================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + + return(dict_tf_get_zip_size(table->flags)); +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Obtain exclusive locks on all index trees of the table. This is to prevent +accessing index trees while InnoDB is updating internal metadata for +operations such as truncate tables. */ +UNIV_INLINE +void +dict_table_x_lock_indexes( +/*======================*/ + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + + ut_a(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* Loop through each index of the table and lock them */ + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + rw_lock_x_lock(dict_index_get_lock(index)); + } +} + +/*********************************************************************//** +Release the exclusive locks on all index tree. */ +UNIV_INLINE +void +dict_table_x_unlock_indexes( +/*========================*/ + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + + ut_a(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + rw_lock_x_unlock(dict_index_get_lock(index)); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************************//** +Gets the number of fields in the internal representation of an index, +including fields added by the dictionary system. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_fields( +/*====================*/ + const dict_index_t* index) /*!< in: an internal + representation of index (in + the dictionary cache) */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->n_fields); +} + +/********************************************************************//** +Gets the number of fields in the internal representation of an index +that uniquely determine the position of an index entry in the index, if +we do not take multiversioning into account: in the B-tree use the value +returned by dict_index_get_n_unique_in_tree. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_unique( +/*====================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(index->cached); + + return(index->n_uniq); +} + +/********************************************************************//** +Gets the number of fields in the internal representation of an index +which uniquely determine the position of an index entry in the index, if +we also take multiversioning into account. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_unique_in_tree( +/*============================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(index->cached); + + if (dict_index_is_clust(index)) { + + return(dict_index_get_n_unique(index)); + } + + return(dict_index_get_n_fields(index)); +} + +/********************************************************************//** +Gets the number of user-defined ordering fields in the index. In the internal +representation of clustered indexes we add the row id to the ordering fields +to make a clustered index unique, but this function returns the number of +fields the user defined in the index as ordering fields. +@return number of fields */ +UNIV_INLINE +ulint +dict_index_get_n_ordering_defined_by_user( +/*======================================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ +{ + return(index->n_user_defined_cols); +} + +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth field of an index. +@return pointer to field object */ +UNIV_INLINE +dict_field_t* +dict_index_get_nth_field( +/*=====================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of field */ +{ + ut_ad(index); + ut_ad(pos < index->n_def); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return((dict_field_t*) (index->fields) + pos); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Returns the position of a system column in an index. +@return position, ULINT_UNDEFINED if not contained */ +UNIV_INLINE +ulint +dict_index_get_sys_col_pos( +/*=======================*/ + const dict_index_t* index, /*!< in: index */ + ulint type) /*!< in: DATA_ROW_ID, ... */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(!dict_index_is_univ(index)); + + if (dict_index_is_clust(index)) { + + return(dict_col_get_clust_pos( + dict_table_get_sys_col(index->table, type), + index)); + } + + return(dict_index_get_nth_col_pos( + index, dict_table_get_sys_col_no(index->table, type))); +} + +/*********************************************************************//** +Gets the field column. +@return field->col, pointer to the table column */ +UNIV_INLINE +const dict_col_t* +dict_field_get_col( +/*===============*/ + const dict_field_t* field) /*!< in: index field */ +{ + ut_ad(field); + + return(field->col); +} + +/********************************************************************//** +Gets pointer to the nth column in an index. +@return column */ +UNIV_INLINE +const dict_col_t* +dict_index_get_nth_col( +/*===================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ +{ + return(dict_field_get_col(dict_index_get_nth_field(index, pos))); +} + +/********************************************************************//** +Gets the column number the nth field in an index. +@return column number */ +UNIV_INLINE +ulint +dict_index_get_nth_col_no( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ +{ + return(dict_col_get_no(dict_index_get_nth_col(index, pos))); +} + +/********************************************************************//** +Looks for column n in an index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INLINE +ulint +dict_index_get_nth_col_pos( +/*=======================*/ + const dict_index_t* index, /*!< in: index */ + ulint n) /*!< in: column number */ +{ + return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE)); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Returns the minimum data size of an index record. +@return minimum data size in bytes */ +UNIV_INLINE +ulint +dict_index_get_min_size( +/*====================*/ + const dict_index_t* index) /*!< in: index */ +{ + ulint n = dict_index_get_n_fields(index); + ulint size = 0; + + while (n--) { + size += dict_col_get_min_size(dict_index_get_nth_col(index, + n)); + } + + return(size); +} + +/*********************************************************************//** +Gets the space id of the root of the index tree. +@return space id */ +UNIV_INLINE +ulint +dict_index_get_space( +/*=================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->space); +} + +/*********************************************************************//** +Sets the space id of the root of the index tree. */ +UNIV_INLINE +void +dict_index_set_space( +/*=================*/ + dict_index_t* index, /*!< in/out: index */ + ulint space) /*!< in: space id */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + index->space = space; +} + +/*********************************************************************//** +Gets the page number of the root of the index tree. +@return page number */ +UNIV_INLINE +ulint +dict_index_get_page( +/*================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->page); +} + +/*********************************************************************//** +Gets the read-write lock of the index tree. +@return read-write lock */ +UNIV_INLINE +rw_lock_t* +dict_index_get_lock( +/*================*/ + dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(&(index->lock)); +} + +/********************************************************************//** +Returns free space reserved for future updates of records. This is +relevant only in the case of many consecutive inserts, as updates +which make the records bigger might fragment the index. +@return number of free bytes on page, reserved for updates */ +UNIV_INLINE +ulint +dict_index_get_space_reserve(void) +/*==============================*/ +{ + return(UNIV_PAGE_SIZE / 16); +} + +/********************************************************************//** +Gets the status of online index creation. +@return the status */ +UNIV_INLINE +enum online_index_status +dict_index_get_online_status( +/*=========================*/ + const dict_index_t* index) /*!< in: secondary index */ +{ + enum online_index_status status; + + status = (enum online_index_status) index->online_status; + + /* Without the index->lock protection, the online + status can change from ONLINE_INDEX_CREATION to + ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in + row_log_apply() once log application is done. So to make + sure the status is ONLINE_INDEX_CREATION or ONLINE_INDEX_COMPLETE + you should always do the recheck after acquiring index->lock */ + +#ifdef UNIV_DEBUG + switch (status) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + return(status); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(status); +} + +/********************************************************************//** +Sets the status of online index creation. */ +UNIV_INLINE +void +dict_index_set_online_status( +/*=========================*/ + dict_index_t* index, /*!< in/out: index */ + enum online_index_status status) /*!< in: status */ +{ + ut_ad(!(index->type & DICT_FTS)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ +#ifdef UNIV_DEBUG + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + break; + case ONLINE_INDEX_ABORTED: + ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED); + break; + case ONLINE_INDEX_ABORTED_DROPPED: + ut_error; + } +#endif /* UNIV_DEBUG */ + + index->online_status = status; + ut_ad(dict_index_get_online_status(index) == status); +} + +/********************************************************************//** +Determines if a secondary index is being or has been created online, +or if the table is being rebuilt online, allowing concurrent modifications +to the table. +@retval true if the index is being or has been built online, or +if this is a clustered index and the table is being or has been rebuilt online +@retval false if the index has been created or the table has been +rebuilt completely */ +UNIV_INLINE +bool +dict_index_is_online_ddl( +/*=====================*/ + const dict_index_t* index) /*!< in: index */ +{ +#ifdef UNIV_DEBUG + if (dict_index_is_clust(index)) { + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + return(true); + case ONLINE_INDEX_COMPLETE: + return(false); + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + break; + } + ut_ad(0); + return(false); + } +#endif /* UNIV_DEBUG */ + + return(UNIV_UNLIKELY(dict_index_get_online_status(index) + != ONLINE_INDEX_COMPLETE)); +} + +/**********************************************************************//** +Check whether a column exists in an FTS index. +@return ULINT_UNDEFINED if no match else the offset within the vector */ +UNIV_INLINE +ulint +dict_table_is_fts_column( +/*=====================*/ + ib_vector_t* indexes,/*!< in: vector containing only FTS indexes */ + ulint col_no) /*!< in: col number to search for */ + +{ + ulint i; + + for (i = 0; i < ib_vector_size(indexes); ++i) { + dict_index_t* index; + + index = (dict_index_t*) ib_vector_getp(indexes, i); + + if (dict_index_contains_col_or_prefix(index, col_no)) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Determine bytes of column prefix to be stored in the undo log. Please +note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix +needs to be stored in the undo log. +@return bytes of column prefix to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_field_len_store_undo( +/*==========================*/ + dict_table_t* table, /*!< in: table */ + const dict_col_t* col) /*!< in: column which index prefix + is based on */ +{ + ulint prefix_len = 0; + + if (dict_table_get_format(table) >= UNIV_FORMAT_B) + { + prefix_len = col->max_prefix + ? col->max_prefix + : DICT_MAX_FIELD_LEN_BY_FORMAT(table); + } + + return(prefix_len); +} + +/********************************************************************//** +Check whether the table is corrupted. +@return nonzero for corrupted table, zero for valid tables */ +UNIV_INLINE +ulint +dict_table_is_corrupted( +/*====================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(table->corrupted); +} + +/********************************************************************//** +Check whether the index is corrupted. +@return nonzero for corrupted index, zero for valid indexes */ +UNIV_INLINE +ulint +dict_index_is_corrupted( +/*====================*/ + const dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return((index->type & DICT_CORRUPT) + || (index->table && index->table->corrupted)); +} + +/********************************************************************//** +Check if the tablespace for the table has been discarded. +@return true if the tablespace has been discarded. */ +UNIV_INLINE +bool +dict_table_is_discarded( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ +{ + return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_DISCARDED)); +} + +/********************************************************************//** +Check if it is a temporary table. +@return true if temporary table flag is set. */ +UNIV_INLINE +bool +dict_table_is_temporary( +/*====================*/ + const dict_table_t* table) /*!< in: table to check */ +{ + return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)); +} + +/**********************************************************************//** +Get index by first field of the index +@return index which is having first field matches +with the field present in field_index position of table */ +UNIV_INLINE +dict_index_t* +dict_table_get_index_on_first_col( +/*==============================*/ + const dict_table_t* table, /*!< in: table */ + ulint col_index) /*!< in: position of column + in table */ +{ + ut_ad(col_index < table->n_cols); + + dict_col_t* column = dict_table_get_nth_col(table, col_index); + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; index = dict_table_get_next_index(index)) { + + if (index->fields[0].col == column) { + return(index); + } + } + ut_error; + return(0); +} + +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h new file mode 100644 index 00000000000..030190b1a8e --- /dev/null +++ b/storage/innobase/include/dict0load.h @@ -0,0 +1,428 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0load.h +Loads to the memory cache database object definitions +from dictionary tables + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0load_h +#define dict0load_h + +#include "univ.i" +#include "dict0types.h" +#include "trx0types.h" +#include "ut0byte.h" +#include "mem0mem.h" +#include "btr0types.h" + +/** enum that defines all system table IDs. @see SYSTEM_TABLE_NAME[] */ +enum dict_system_id_t { + SYS_TABLES = 0, + SYS_INDEXES, + SYS_COLUMNS, + SYS_FIELDS, + SYS_FOREIGN, + SYS_FOREIGN_COLS, + SYS_TABLESPACES, + SYS_DATAFILES, + + /* This must be last item. Defines the number of system tables. */ + SYS_NUM_SYSTEM_TABLES +}; + +/** Status bit for dict_process_sys_tables_rec_and_mtr_commit() */ +enum dict_table_info_t { + DICT_TABLE_LOAD_FROM_RECORD = 0,/*!< Directly populate a dict_table_t + structure with information from + a SYS_TABLES record */ + DICT_TABLE_LOAD_FROM_CACHE = 1 /*!< Check first whether dict_table_t + is in the cache, if so, return it */ +}; + +/** Check type for dict_check_tablespaces_and_store_max_id() */ +enum dict_check_t { + /** No user tablespaces have been opened + (no crash recovery, no transactions recovered). */ + DICT_CHECK_NONE_LOADED = 0, + /** Some user tablespaces may have been opened + (no crash recovery; recovered table locks for transactions). */ + DICT_CHECK_SOME_LOADED, + /** All user tablespaces have been opened (crash recovery). */ + DICT_CHECK_ALL_LOADED +}; + +/********************************************************************//** +In a crash recovery we already have all the tablespace objects created. +This function compares the space id information in the InnoDB data dictionary +to what we already read with fil_load_single_table_tablespaces(). + +In a normal startup, we create the tablespace objects for every table in +InnoDB's data dictionary, if the corresponding .ibd file exists. +We also scan the biggest space id, and store it to fil_system. */ +UNIV_INTERN +void +dict_check_tablespaces_and_store_max_id( +/*====================================*/ + dict_check_t dict_check); /*!< in: how to check */ +/********************************************************************//** +Finds the first table name in the given database. +@return own: table name, NULL if does not exist; the caller must free +the memory in the string! */ +UNIV_INTERN +char* +dict_get_first_table_name_in_db( +/*============================*/ + const char* name); /*!< in: database name which ends to '/' */ + +/********************************************************************//** +Loads a table definition from a SYS_TABLES record to dict_table_t. +Does not load any columns or indexes. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_table_low( +/*================*/ + const char* name, /*!< in: table name */ + const rec_t* rec, /*!< in: SYS_TABLES record */ + dict_table_t** table); /*!< out,own: table, or NULL */ +/********************************************************************//** +Loads a table column definition from a SYS_COLUMNS record to +dict_table_t. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_column_low( +/*=================*/ + dict_table_t* table, /*!< in/out: table, could be NULL + if we just populate a dict_column_t + struct with information from + a SYS_COLUMNS record */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + dict_col_t* column, /*!< out: dict_column_t to fill, + or NULL if table != NULL */ + table_id_t* table_id, /*!< out: table id */ + const char** col_name, /*!< out: column name */ + const rec_t* rec); /*!< in: SYS_COLUMNS record */ +/********************************************************************//** +Loads an index definition from a SYS_INDEXES record to dict_index_t. +If allocate=TRUE, we will create a dict_index_t structure and fill it +accordingly. If allocated=FALSE, the dict_index_t will be supplied by +the caller and filled with information read from the record. @return +error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_index_low( +/*================*/ + byte* table_id, /*!< in/out: table id (8 bytes), + an "in" value if allocate=TRUE + and "out" when allocate=FALSE */ + const char* table_name, /*!< in: table name */ + mem_heap_t* heap, /*!< in/out: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_INDEXES record */ + ibool allocate, /*!< in: TRUE=allocate *index, + FALSE=fill in a pre-allocated + *index */ + dict_index_t** index); /*!< out,own: index, or NULL */ +/********************************************************************//** +Loads an index field definition from a SYS_FIELDS record to +dict_index_t. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_load_field_low( +/*================*/ + byte* index_id, /*!< in/out: index id (8 bytes) + an "in" value if index != NULL + and "out" if index == NULL */ + dict_index_t* index, /*!< in/out: index, could be NULL + if we just populate a dict_field_t + struct with information from + a SYS_FIELDS record */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + byte* last_index_id, /*!< in: last index id */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + const rec_t* rec); /*!< in: SYS_FIELDS record */ +/********************************************************************//** +Using the table->heap, copy the null-terminated filepath into +table->data_dir_path and put a null byte before the extension. +This allows SHOW CREATE TABLE to return the correct DATA DIRECTORY path. +Make this data directory path only if it has not yet been saved. */ +UNIV_INTERN +void +dict_save_data_dir_path( +/*====================*/ + dict_table_t* table, /*!< in/out: table */ + char* filepath); /*!< in: filepath of tablespace */ +/*****************************************************************//** +Make sure the data_file_name is saved in dict_table_t if needed. Try to +read it from the file dictionary first, then from SYS_DATAFILES. */ +UNIV_INTERN +void +dict_get_and_save_data_dir_path( +/*============================*/ + dict_table_t* table, /*!< in/out: table */ + bool dict_mutex_own); /*!< in: true if dict_sys->mutex + is owned already */ +/********************************************************************//** +Loads a table definition and also all its index definitions, and also +the cluster definition if the table is a member in a cluster. Also loads +all foreign key constraints where the foreign key is in the table or where +a foreign key references columns in this table. +@return table, NULL if does not exist; if the table is stored in an +.ibd file, but the file does not exist, then we set the +ibd_file_missing flag TRUE in the table object we return */ +UNIV_INTERN +dict_table_t* +dict_load_table( +/*============*/ + const char* name, /*!< in: table name in the + databasename/tablename format */ + ibool cached, /*!< in: TRUE=add to cache, FALSE=do not */ + dict_err_ignore_t ignore_err); + /*!< in: error to be ignored when loading + table and its indexes' definition */ +/***********************************************************************//** +Loads a table object based on the table id. +@return table; NULL if table does not exist */ +UNIV_INTERN +dict_table_t* +dict_load_table_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err); /*!< in: errors to ignore + when loading the table */ +/********************************************************************//** +This function is called when the database is booted. +Loads system table index definitions except for the clustered index which +is added to the dictionary cache at booting before calling this function. */ +UNIV_INTERN +void +dict_load_sys_table( +/*================*/ + dict_table_t* table); /*!< in: system table */ +/***********************************************************************//** +Loads foreign key constraints where the table is either the foreign key +holder or where the table is referenced by a foreign key. Adds these +constraints to the data dictionary. Note that we know that the dictionary +cache already contains all constraints where the other relevant table is +already in the dictionary cache. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_load_foreigns( +/*===============*/ + const char* table_name, /*!< in: table name */ + const char** col_names, /*!< in: column names, or NULL + to use table->col_names */ + bool check_recursive,/*!< in: Whether to check + recursive load of tables + chained by FK */ + bool check_charsets, /*!< in: whether to check + charset compatibility */ + dict_err_ignore_t ignore_err) /*!< in: error to be ignored */ + __attribute__((nonnull(1), warn_unused_result)); +/********************************************************************//** +Prints to the standard output information on all tables found in the data +dictionary system table. */ +UNIV_INTERN +void +dict_print(void); +/*============*/ + +/********************************************************************//** +This function opens a system table, and return the first record. +@return first record of the system table */ +UNIV_INTERN +const rec_t* +dict_startscan_system( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor to + the record */ + mtr_t* mtr, /*!< in: the mini-transaction */ + dict_system_id_t system_id); /*!< in: which system table to open */ +/********************************************************************//** +This function get the next system table record as we scan the table. +@return the record if found, NULL if end of scan. */ +UNIV_INTERN +const rec_t* +dict_getnext_system( +/*================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor + to the record */ + mtr_t* mtr); /*!< in: the mini-transaction */ +/********************************************************************//** +This function processes one SYS_TABLES record and populate the dict_table_t +struct for the table. Extracted out of dict_print() to be used by +both monitor table output and information schema innodb_sys_tables output. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_tables_rec_and_mtr_commit( +/*=======================================*/ + mem_heap_t* heap, /*!< in: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_TABLES record */ + dict_table_t** table, /*!< out: dict_table_t to fill */ + dict_table_info_t status, /*!< in: status bit controls + options such as whether we shall + look for dict_table_t from cache + first */ + mtr_t* mtr); /*!< in/out: mini-transaction, + will be committed */ +/********************************************************************//** +This function parses a SYS_INDEXES record and populate a dict_index_t +structure with the information from the record. For detail information +about SYS_INDEXES fields, please refer to dict_boot() function. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_indexes_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_INDEXES rec */ + dict_index_t* index, /*!< out: dict_index_t to be + filled */ + table_id_t* table_id); /*!< out: table id */ +/********************************************************************//** +This function parses a SYS_COLUMNS record and populate a dict_column_t +structure with the information from the record. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_columns_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_COLUMNS rec */ + dict_col_t* column, /*!< out: dict_col_t to be filled */ + table_id_t* table_id, /*!< out: table id */ + const char** col_name); /*!< out: column name */ +/********************************************************************//** +This function parses a SYS_FIELDS record and populate a dict_field_t +structure with the information from the record. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_fields_rec( +/*========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FIELDS rec */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + index_id_t* index_id, /*!< out: current index id */ + index_id_t last_id); /*!< in: previous index id */ +/********************************************************************//** +This function parses a SYS_FOREIGN record and populate a dict_foreign_t +structure with the information from the record. For detail information +about SYS_FOREIGN fields, please refer to dict_load_foreign() function +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_foreign_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN rec */ + dict_foreign_t* foreign); /*!< out: dict_foreign_t to be + filled */ +/********************************************************************//** +This function parses a SYS_FOREIGN_COLS record and extract necessary +information from the record and return to caller. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_foreign_col_rec( +/*=============================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */ + const char** name, /*!< out: foreign key constraint name */ + const char** for_col_name, /*!< out: referencing column name */ + const char** ref_col_name, /*!< out: referenced column name + in referenced table */ + ulint* pos); /*!< out: column position */ +/********************************************************************//** +This function parses a SYS_TABLESPACES record, extracts necessary +information from the record and returns to caller. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_tablespaces( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */ + ulint* space, /*!< out: pace id */ + const char** name, /*!< out: tablespace name */ + ulint* flags); /*!< out: tablespace flags */ +/********************************************************************//** +This function parses a SYS_DATAFILES record, extracts necessary +information from the record and returns to caller. +@return error message, or NULL on success */ +UNIV_INTERN +const char* +dict_process_sys_datafiles( +/*=======================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_DATAFILES rec */ + ulint* space, /*!< out: pace id */ + const char** path); /*!< out: datafile path */ +/********************************************************************//** +Get the filepath for a spaceid from SYS_DATAFILES. This function provides +a temporary heap which is used for the table lookup, but not for the path. +The caller must free the memory for the path returned. This function can +return NULL if the space ID is not found in SYS_DATAFILES, then the caller +will assume that the ibd file is in the normal datadir. +@return own: A copy of the first datafile found in SYS_DATAFILES.PATH for +the given space ID. NULL if space ID is zero or not found. */ +UNIV_INTERN +char* +dict_get_first_path( +/*================*/ + ulint space, /*!< in: space id */ + const char* name); /*!< in: tablespace name */ +/********************************************************************//** +Update the record for space_id in SYS_TABLESPACES to this filepath. +@return DB_SUCCESS if OK, dberr_t if the insert failed */ +UNIV_INTERN +dberr_t +dict_update_filepath( +/*=================*/ + ulint space_id, /*!< in: space id */ + const char* filepath); /*!< in: filepath */ +/********************************************************************//** +Insert records into SYS_TABLESPACES and SYS_DATAFILES. +@return DB_SUCCESS if OK, dberr_t if the insert failed */ +UNIV_INTERN +dberr_t +dict_insert_tablespace_and_filepath( +/*================================*/ + ulint space, /*!< in: space id */ + const char* name, /*!< in: talespace name */ + const char* filepath, /*!< in: filepath */ + ulint fsp_flags); /*!< in: tablespace flags */ + +#ifndef UNIV_NONINL +#include "dict0load.ic" +#endif + +#endif diff --git a/storage/innobase/include/dict0load.ic b/storage/innobase/include/dict0load.ic new file mode 100644 index 00000000000..2c0f1ff38a5 --- /dev/null +++ b/storage/innobase/include/dict0load.ic @@ -0,0 +1,26 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0load.ic +Loads to the memory cache database object definitions +from dictionary tables + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h new file mode 100644 index 00000000000..460a7e125ad --- /dev/null +++ b/storage/innobase/include/dict0mem.h @@ -0,0 +1,1214 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0mem.h +Data dictionary memory object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0mem_h +#define dict0mem_h + +#include "univ.i" +#include "dict0types.h" +#include "data0type.h" +#include "mem0mem.h" +#include "row0types.h" +#include "rem0types.h" +#include "btr0types.h" +#ifndef UNIV_HOTBACKUP +# include "lock0types.h" +# include "que0types.h" +# include "sync0rw.h" +#endif /* !UNIV_HOTBACKUP */ +#include "ut0mem.h" +#include "ut0lst.h" +#include "ut0rnd.h" +#include "ut0byte.h" +#include "hash0hash.h" +#include "trx0types.h" +#include "fts0fts.h" +#include "os0once.h" +#include <set> +#include <algorithm> +#include <iterator> + +/* Forward declaration. */ +struct ib_rbt_t; + +/** Type flags of an index: OR'ing of the flags is allowed to define a +combination of types */ +/* @{ */ +#define DICT_CLUSTERED 1 /*!< clustered index */ +#define DICT_UNIQUE 2 /*!< unique index */ +#define DICT_UNIVERSAL 4 /*!< index which can contain records from any + other index */ +#define DICT_IBUF 8 /*!< insert buffer tree */ +#define DICT_CORRUPT 16 /*!< bit to store the corrupted flag + in SYS_INDEXES.TYPE */ +#define DICT_FTS 32 /* FTS index; can't be combined with the + other flags */ + +#define DICT_IT_BITS 6 /*!< number of bits used for + SYS_INDEXES.TYPE */ +/* @} */ + +#if 0 /* not implemented, retained for history */ +/** Types for a table object */ +#define DICT_TABLE_ORDINARY 1 /*!< ordinary table */ +#define DICT_TABLE_CLUSTER_MEMBER 2 +#define DICT_TABLE_CLUSTER 3 /* this means that the table is + really a cluster definition */ +#endif + +/* Table and tablespace flags are generally not used for the Antelope file +format except for the low order bit, which is used differently depending on +where the flags are stored. + +==================== Low order flags bit ========================= + | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC +SYS_TABLES.TYPE | 1 | 1 | 1 +dict_table_t::flags | 0 | 1 | 1 +FSP_SPACE_FLAGS | 0 | 0 | 1 +fil_space_t::flags | 0 | 0 | 1 + +Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1) +and the tablespace flags field was always 0. In the 5.1 plugin, these fields +were repurposed to identify compressed and dynamic row formats. + +The following types and constants describe the flags found in dict_table_t +and SYS_TABLES.TYPE. Similar flags found in fil_space_t and FSP_SPACE_FLAGS +are described in fsp0fsp.h. */ + +/* @{ */ +/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */ +#define DICT_TF_REDUNDANT 0 /*!< Redundant row format. */ +/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */ +#define DICT_TF_COMPACT 1 /*!< Compact row format. */ + +/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether +the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */ +#define DICT_N_COLS_COMPACT 0x80000000UL + +/** Width of the COMPACT flag */ +#define DICT_TF_WIDTH_COMPACT 1 +/** Width of the ZIP_SSIZE flag */ +#define DICT_TF_WIDTH_ZIP_SSIZE 4 +/** Width of the ATOMIC_BLOBS flag. The Antelope file formats broke up +BLOB and TEXT fields, storing the first 768 bytes in the clustered index. +Brracuda row formats store the whole blob or text field off-page atomically. +Secondary indexes are created from this external data using row_ext_t +to cache the BLOB prefixes. */ +#define DICT_TF_WIDTH_ATOMIC_BLOBS 1 +/** If a table is created with the MYSQL option DATA DIRECTORY and +innodb-file-per-table, an older engine will not be able to find that table. +This flag prevents older engines from attempting to open the table and +allows InnoDB to update_create_info() accordingly. */ +#define DICT_TF_WIDTH_DATA_DIR 1 + +/** Width of all the currently known table flags */ +#define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ + + DICT_TF_WIDTH_ZIP_SSIZE \ + + DICT_TF_WIDTH_ATOMIC_BLOBS \ + + DICT_TF_WIDTH_DATA_DIR) + +/** A mask of all the known/used bits in table flags */ +#define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS)) + +/** Zero relative shift position of the COMPACT field */ +#define DICT_TF_POS_COMPACT 0 +/** Zero relative shift position of the ZIP_SSIZE field */ +#define DICT_TF_POS_ZIP_SSIZE (DICT_TF_POS_COMPACT \ + + DICT_TF_WIDTH_COMPACT) +/** Zero relative shift position of the ATOMIC_BLOBS field */ +#define DICT_TF_POS_ATOMIC_BLOBS (DICT_TF_POS_ZIP_SSIZE \ + + DICT_TF_WIDTH_ZIP_SSIZE) +/** Zero relative shift position of the DATA_DIR field */ +#define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \ + + DICT_TF_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the start of the UNUSED bits */ +#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \ + + DICT_TF_WIDTH_DATA_DIR) + +/** Bit mask of the COMPACT field */ +#define DICT_TF_MASK_COMPACT \ + ((~(~0 << DICT_TF_WIDTH_COMPACT)) \ + << DICT_TF_POS_COMPACT) +/** Bit mask of the ZIP_SSIZE field */ +#define DICT_TF_MASK_ZIP_SSIZE \ + ((~(~0 << DICT_TF_WIDTH_ZIP_SSIZE)) \ + << DICT_TF_POS_ZIP_SSIZE) +/** Bit mask of the ATOMIC_BLOBS field */ +#define DICT_TF_MASK_ATOMIC_BLOBS \ + ((~(~0 << DICT_TF_WIDTH_ATOMIC_BLOBS)) \ + << DICT_TF_POS_ATOMIC_BLOBS) +/** Bit mask of the DATA_DIR field */ +#define DICT_TF_MASK_DATA_DIR \ + ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \ + << DICT_TF_POS_DATA_DIR) + +/** Return the value of the COMPACT field */ +#define DICT_TF_GET_COMPACT(flags) \ + ((flags & DICT_TF_MASK_COMPACT) \ + >> DICT_TF_POS_COMPACT) +/** Return the value of the ZIP_SSIZE field */ +#define DICT_TF_GET_ZIP_SSIZE(flags) \ + ((flags & DICT_TF_MASK_ZIP_SSIZE) \ + >> DICT_TF_POS_ZIP_SSIZE) +/** Return the value of the ATOMIC_BLOBS field */ +#define DICT_TF_HAS_ATOMIC_BLOBS(flags) \ + ((flags & DICT_TF_MASK_ATOMIC_BLOBS) \ + >> DICT_TF_POS_ATOMIC_BLOBS) +/** Return the value of the ATOMIC_BLOBS field */ +#define DICT_TF_HAS_DATA_DIR(flags) \ + ((flags & DICT_TF_MASK_DATA_DIR) \ + >> DICT_TF_POS_DATA_DIR) +/** Return the contents of the UNUSED bits */ +#define DICT_TF_GET_UNUSED(flags) \ + (flags >> DICT_TF_POS_UNUSED) +/* @} */ + +/** @brief Table Flags set number 2. + +These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags +will be written as 0. The column may contain garbage for tables +created with old versions of InnoDB that only implemented +ROW_FORMAT=REDUNDANT. InnoDB engines do not check these flags +for unknown bits in order to protect backward incompatibility. */ +/* @{ */ +/** Total number of bits in table->flags2. */ +#define DICT_TF2_BITS 7 +#define DICT_TF2_BIT_MASK ~(~0 << DICT_TF2_BITS) + +/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */ +#define DICT_TF2_TEMPORARY 1 +/** The table has an internal defined DOC ID column */ +#define DICT_TF2_FTS_HAS_DOC_ID 2 +/** The table has an FTS index */ +#define DICT_TF2_FTS 4 +/** Need to add Doc ID column for FTS index build. +This is a transient bit for index build */ +#define DICT_TF2_FTS_ADD_DOC_ID 8 +/** This bit is used during table creation to indicate that it will +use its own tablespace instead of the system tablespace. */ +#define DICT_TF2_USE_TABLESPACE 16 + +/** Set when we discard/detach the tablespace */ +#define DICT_TF2_DISCARDED 32 + +/** This bit is set if all aux table names (both common tables and +index tables) of a FTS table are in HEX format. */ +#define DICT_TF2_FTS_AUX_HEX_NAME 64 +/* @} */ + +#define DICT_TF2_FLAG_SET(table, flag) \ + (table->flags2 |= (flag)) + +#define DICT_TF2_FLAG_IS_SET(table, flag) \ + (table->flags2 & (flag)) + +#define DICT_TF2_FLAG_UNSET(table, flag) \ + (table->flags2 &= ~(flag)) + +/** Tables could be chained together with Foreign key constraint. When +first load the parent table, we would load all of its descedents. +This could result in rescursive calls and out of stack error eventually. +DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads, +when exceeded, the child table will not be loaded. It will be loaded when +the foreign constraint check needs to be run. */ +#define DICT_FK_MAX_RECURSIVE_LOAD 20 + +/** Similarly, when tables are chained together with foreign key constraints +with on cascading delete/update clause, delete from parent table could +result in recursive cascading calls. This defines the maximum number of +such cascading deletes/updates allowed. When exceeded, the delete from +parent table will fail, and user has to drop excessive foreign constraint +before proceeds. */ +#define FK_MAX_CASCADE_DEL 255 + +/**********************************************************************//** +Creates a table memory object. +@return own: table object */ +UNIV_INTERN +dict_table_t* +dict_mem_table_create( +/*==================*/ + const char* name, /*!< in: table name */ + ulint space, /*!< in: space where the clustered index + of the table is placed */ + ulint n_cols, /*!< in: number of columns */ + ulint flags, /*!< in: table flags */ + ulint flags2); /*!< in: table flags2 */ +/****************************************************************//** +Free a table memory object. */ +UNIV_INTERN +void +dict_mem_table_free( +/*================*/ + dict_table_t* table); /*!< in: table */ +/**********************************************************************//** +Adds a column definition to a table. */ +UNIV_INTERN +void +dict_mem_table_add_col( +/*===================*/ + dict_table_t* table, /*!< in: table */ + mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */ + const char* name, /*!< in: column name, or NULL */ + ulint mtype, /*!< in: main datatype */ + ulint prtype, /*!< in: precise type */ + ulint len) /*!< in: precision */ + __attribute__((nonnull(1))); +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +UNIV_INTERN +void +dict_mem_table_col_rename( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + unsigned nth_col,/*!< in: column index */ + const char* from, /*!< in: old column name */ + const char* to) /*!< in: new column name */ + __attribute__((nonnull)); +/**********************************************************************//** +This function populates a dict_col_t memory structure with +supplied information. */ +UNIV_INTERN +void +dict_mem_fill_column_struct( +/*========================*/ + dict_col_t* column, /*!< out: column struct to be + filled */ + ulint col_pos, /*!< in: column position */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint col_len); /*!< in: column length */ +/**********************************************************************//** +This function poplulates a dict_index_t index memory structure with +supplied information. */ +UNIV_INLINE +void +dict_mem_fill_index_struct( +/*=======================*/ + dict_index_t* index, /*!< out: index to be filled */ + mem_heap_t* heap, /*!< in: memory heap */ + const char* table_name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + ulint space, /*!< in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields); /*!< in: number of fields */ +/**********************************************************************//** +Creates an index memory object. +@return own: index object */ +UNIV_INTERN +dict_index_t* +dict_mem_index_create( +/*==================*/ + const char* table_name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + ulint space, /*!< in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields); /*!< in: number of fields */ +/**********************************************************************//** +Adds a field definition to an index. NOTE: does not take a copy +of the column name if the field is a column. The memory occupied +by the column name may be released only after publishing the index. */ +UNIV_INTERN +void +dict_mem_index_add_field( +/*=====================*/ + dict_index_t* index, /*!< in: index */ + const char* name, /*!< in: column name */ + ulint prefix_len); /*!< in: 0 or the column prefix length + in a MySQL index like + INDEX (textcol(25)) */ +/**********************************************************************//** +Frees an index memory object. */ +UNIV_INTERN +void +dict_mem_index_free( +/*================*/ + dict_index_t* index); /*!< in: index */ +/**********************************************************************//** +Creates and initializes a foreign constraint memory object. +@return own: foreign constraint struct */ +UNIV_INTERN +dict_foreign_t* +dict_mem_foreign_create(void); +/*=========================*/ + +/**********************************************************************//** +Sets the foreign_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup +will point to foreign_table_name. If 2, then another string is +allocated from the heap and set to lower case. */ +UNIV_INTERN +void +dict_mem_foreign_table_name_lookup_set( +/*===================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc); /*!< in: is an alloc needed */ + +/**********************************************************************//** +Sets the referenced_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup +will point to referenced_table_name. If 2, then another string is +allocated from the heap and set to lower case. */ +UNIV_INTERN +void +dict_mem_referenced_table_name_lookup_set( +/*======================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc); /*!< in: is an alloc needed */ + +/** Create a temporary tablename like "#sql-ibtid-inc where + tid = the Table ID + inc = a randomly initialized number that is incremented for each file +The table ID is a 64 bit integer, can use up to 20 digits, and is +initialized at bootstrap. The second number is 32 bits, can use up to 10 +digits, and is initialized at startup to a randomly distributed number. +It is hoped that the combination of these two numbers will provide a +reasonably unique temporary file name. +@param[in] heap A memory heap +@param[in] dbtab Table name in the form database/table name +@param[in] id Table id +@return A unique temporary tablename suitable for InnoDB use */ +UNIV_INTERN +char* +dict_mem_create_temporary_tablename( + mem_heap_t* heap, + const char* dbtab, + table_id_t id); + +/** Initialize dict memory variables */ + +void +dict_mem_init(void); + +/** Data structure for a column in a table */ +struct dict_col_t{ + /*----------------------*/ + /** The following are copied from dtype_t, + so that all bit-fields can be packed tightly. */ + /* @{ */ + unsigned prtype:32; /*!< precise type; MySQL data + type, charset code, flags to + indicate nullability, + signedness, whether this is a + binary string, whether this is + a true VARCHAR where MySQL + uses 2 bytes to store the length */ + unsigned mtype:8; /*!< main data type */ + + /* the remaining fields do not affect alphabetical ordering: */ + + unsigned len:16; /*!< length; for MySQL data this + is field->pack_length(), + except that for a >= 5.0.3 + type true VARCHAR this is the + maximum byte length of the + string data (in addition to + the string, MySQL uses 1 or 2 + bytes to store the string length) */ + + unsigned mbminmaxlen:5; /*!< minimum and maximum length of a + character, in bytes; + DATA_MBMINMAXLEN(mbminlen,mbmaxlen); + mbminlen=DATA_MBMINLEN(mbminmaxlen); + mbmaxlen=DATA_MBMINLEN(mbminmaxlen) */ + /*----------------------*/ + /* End of definitions copied from dtype_t */ + /* @} */ + + unsigned ind:10; /*!< table column position + (starting from 0) */ + unsigned ord_part:1; /*!< nonzero if this column + appears in the ordering fields + of an index */ + unsigned max_prefix:12; /*!< maximum index prefix length on + this column. Our current max limit is + 3072 for Barracuda table */ +}; + +/** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and +is the maximum indexed column length (or indexed prefix length) in +ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. Also, in any format, +any fixed-length field that is longer than this will be encoded as +a variable-length field. + +It is set to 3*256, so that one can create a column prefix index on +256 characters of a TEXT or VARCHAR column also in the UTF-8 +charset. In that charset, a character may take at most 3 bytes. This +constant MUST NOT BE CHANGED, or the compatibility of InnoDB data +files would be at risk! */ +#define DICT_ANTELOPE_MAX_INDEX_COL_LEN REC_ANTELOPE_MAX_INDEX_COL_LEN + +/** Find out maximum indexed column length by its table format. +For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum +field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For +Barracuda row formats COMPRESSED and DYNAMIC, the length could +be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */ +#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \ + ((dict_table_get_format(table) < UNIV_FORMAT_B) \ + ? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) \ + : REC_VERSION_56_MAX_INDEX_COL_LEN) + +#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \ + ((DICT_TF_HAS_ATOMIC_BLOBS(flags) < UNIV_FORMAT_B) \ + ? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) \ + : REC_VERSION_56_MAX_INDEX_COL_LEN) + +/** Defines the maximum fixed length column size */ +#define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN + +/** Data structure for a field in an index */ +struct dict_field_t{ + dict_col_t* col; /*!< pointer to the table column */ + const char* name; /*!< name of the column */ + unsigned prefix_len:12; /*!< 0 or the length of the column + prefix in bytes in a MySQL index of + type, e.g., INDEX (textcol(25)); + must be smaller than + DICT_MAX_FIELD_LEN_BY_FORMAT; + NOTE that in the UTF-8 charset, MySQL + sets this to (mbmaxlen * the prefix len) + in UTF-8 chars */ + unsigned fixed_len:10; /*!< 0 or the fixed length of the + column if smaller than + DICT_ANTELOPE_MAX_INDEX_COL_LEN */ +}; + +/**********************************************************************//** +PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID +COMPRESSION FAILURES +(Note: this is relevant only for compressed indexes) +GOAL: Avoid compression failures by maintaining information about the +compressibility of data. If data is not very compressible then leave +some extra space 'padding' in the uncompressed page making it more +likely that compression of less than fully packed uncompressed page will +succeed. + +This padding heuristic works by increasing the pad linearly until the +desired failure rate is reached. A "round" is a fixed number of +compression operations. +After each round, the compression failure rate for that round is +computed. If the failure rate is too high, then padding is incremented +by a fixed value, otherwise it's left intact. +If the compression failure is lower than the desired rate for a fixed +number of consecutive rounds, then the padding is decreased by a fixed +value. This is done to prevent overshooting the padding value, +and to accommodate the possible change in data compressibility. */ + +/** Number of zip ops in one round. */ +#define ZIP_PAD_ROUND_LEN (128) + +/** Number of successful rounds after which the padding is decreased */ +#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT (5) + +/** Amount by which padding is increased. */ +#define ZIP_PAD_INCR (128) + +/** Percentage of compression failures that are allowed in a single +round */ +extern ulong zip_failure_threshold_pct; + +/** Maximum percentage of a page that can be allowed as a pad to avoid +compression failures */ +extern ulong zip_pad_max; + +/** Data structure to hold information about about how much space in +an uncompressed page should be left as padding to avoid compression +failures. This estimate is based on a self-adapting heuristic. */ +struct zip_pad_info_t { + os_fast_mutex_t mutex; /*!< mutex protecting the info */ + ulint pad; /*!< number of bytes used as pad */ + ulint success;/*!< successful compression ops during + current round */ + ulint failure;/*!< failed compression ops during + current round */ + ulint n_rounds;/*!< number of currently successful + rounds */ +}; + +/** Data structure for an index. Most fields will be +initialized to 0, NULL or FALSE in dict_mem_index_create(). */ +struct dict_index_t{ + index_id_t id; /*!< id of the index */ + mem_heap_t* heap; /*!< memory heap */ + const char* name; /*!< index name */ + const char* table_name;/*!< table name */ + dict_table_t* table; /*!< back pointer to table */ +#ifndef UNIV_HOTBACKUP + unsigned space:32; + /*!< space where the index tree is placed */ + unsigned page:32;/*!< index tree root page number */ +#endif /* !UNIV_HOTBACKUP */ + unsigned type:DICT_IT_BITS; + /*!< index type (DICT_CLUSTERED, DICT_UNIQUE, + DICT_UNIVERSAL, DICT_IBUF, DICT_CORRUPT) */ +#define MAX_KEY_LENGTH_BITS 12 + unsigned trx_id_offset:MAX_KEY_LENGTH_BITS; + /*!< position of the trx id column + in a clustered index record, if the fields + before it are known to be of a fixed size, + 0 otherwise */ +#if (1<<MAX_KEY_LENGTH_BITS) < MAX_KEY_LENGTH +# error (1<<MAX_KEY_LENGTH_BITS) < MAX_KEY_LENGTH +#endif + unsigned n_user_defined_cols:10; + /*!< number of columns the user defined to + be in the index: in the internal + representation we add more columns */ + unsigned n_uniq:10;/*!< number of fields from the beginning + which are enough to determine an index + entry uniquely */ + unsigned n_def:10;/*!< number of fields defined so far */ + unsigned n_fields:10;/*!< number of fields in the index */ + unsigned n_nullable:10;/*!< number of nullable fields */ + unsigned cached:1;/*!< TRUE if the index object is in the + dictionary cache */ + unsigned to_be_dropped:1; + /*!< TRUE if the index is to be dropped; + protected by dict_operation_lock */ + unsigned online_status:2; + /*!< enum online_index_status. + Transitions from ONLINE_INDEX_COMPLETE (to + ONLINE_INDEX_CREATION) are protected + by dict_operation_lock and + dict_sys->mutex. Other changes are + protected by index->lock. */ + dict_field_t* fields; /*!< array of field descriptions */ +#ifndef UNIV_HOTBACKUP + UT_LIST_NODE_T(dict_index_t) + indexes;/*!< list of indexes of the table */ + btr_search_t* search_info; + /*!< info used in optimistic searches */ + row_log_t* online_log; + /*!< the log of modifications + during online index creation; + valid when online_status is + ONLINE_INDEX_CREATION */ + /*----------------------*/ + /** Statistics for query optimization */ + /* @{ */ + ib_uint64_t* stat_n_diff_key_vals; + /*!< approximate number of different + key values for this index, for each + n-column prefix where 1 <= n <= + dict_get_n_unique(index) (the array is + indexed from 0 to n_uniq-1); we + periodically calculate new + estimates */ + ib_uint64_t* stat_n_sample_sizes; + /*!< number of pages that were sampled + to calculate each of stat_n_diff_key_vals[], + e.g. stat_n_sample_sizes[3] pages were sampled + to get the number stat_n_diff_key_vals[3]. */ + ib_uint64_t* stat_n_non_null_key_vals; + /* approximate number of non-null key values + for this index, for each column where + 1 <= n <= dict_get_n_unique(index) (the array + is indexed from 0 to n_uniq-1); This + is used when innodb_stats_method is + "nulls_ignored". */ + ulint stat_index_size; + /*!< approximate index size in + database pages */ + ulint stat_n_leaf_pages; + /*!< approximate number of leaf pages in the + index tree */ + /* @} */ + rw_lock_t lock; /*!< read-write lock protecting the + upper levels of the index tree */ + trx_id_t trx_id; /*!< id of the transaction that created this + index, or 0 if the index existed + when InnoDB was started up */ + zip_pad_info_t zip_pad;/*!< Information about state of + compression failures and successes */ +#endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_BLOB_DEBUG + ib_mutex_t blobs_mutex; + /*!< mutex protecting blobs */ + ib_rbt_t* blobs; /*!< map of (page_no,heap_no,field_no) + to first_blob_page_no; protected by + blobs_mutex; @see btr_blob_dbg_t */ +#endif /* UNIV_BLOB_DEBUG */ +#ifdef UNIV_DEBUG + ulint magic_n;/*!< magic number */ +/** Value of dict_index_t::magic_n */ +# define DICT_INDEX_MAGIC_N 76789786 +#endif +}; + +/** The status of online index creation */ +enum online_index_status { + /** the index is complete and ready for access */ + ONLINE_INDEX_COMPLETE = 0, + /** the index is being created, online + (allowing concurrent modifications) */ + ONLINE_INDEX_CREATION, + /** secondary index creation was aborted and the index + should be dropped as soon as index->table->n_ref_count reaches 0, + or online table rebuild was aborted and the clustered index + of the original table should soon be restored to + ONLINE_INDEX_COMPLETE */ + ONLINE_INDEX_ABORTED, + /** the online index creation was aborted, the index was + dropped from the data dictionary and the tablespace, and it + should be dropped from the data dictionary cache as soon as + index->table->n_ref_count reaches 0. */ + ONLINE_INDEX_ABORTED_DROPPED +}; + +/** Data structure for a foreign key constraint; an example: +FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be +initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */ +struct dict_foreign_t{ + mem_heap_t* heap; /*!< this object is allocated from + this memory heap */ + char* id; /*!< id of the constraint as a + null-terminated string */ + unsigned n_fields:10; /*!< number of indexes' first fields + for which the foreign key + constraint is defined: we allow the + indexes to contain more fields than + mentioned in the constraint, as long + as the first fields are as mentioned */ + unsigned type:6; /*!< 0 or DICT_FOREIGN_ON_DELETE_CASCADE + or DICT_FOREIGN_ON_DELETE_SET_NULL */ + char* foreign_table_name;/*!< foreign table name */ + char* foreign_table_name_lookup; + /*!< foreign table name used for dict lookup */ + dict_table_t* foreign_table; /*!< table where the foreign key is */ + const char** foreign_col_names;/*!< names of the columns in the + foreign key */ + char* referenced_table_name;/*!< referenced table name */ + char* referenced_table_name_lookup; + /*!< referenced table name for dict lookup*/ + dict_table_t* referenced_table;/*!< table where the referenced key + is */ + const char** referenced_col_names;/*!< names of the referenced + columns in the referenced table */ + dict_index_t* foreign_index; /*!< foreign index; we require that + both tables contain explicitly defined + indexes for the constraint: InnoDB + does not generate new indexes + implicitly */ + dict_index_t* referenced_index;/*!< referenced index */ +}; + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_t& foreign); + +struct dict_foreign_print { + + dict_foreign_print(std::ostream& out) + : m_out(out) + {} + + void operator()(const dict_foreign_t* foreign) { + m_out << *foreign; + } +private: + std::ostream& m_out; +}; + +/** Compare two dict_foreign_t objects using their ids. Used in the ordering +of dict_table_t::foreign_set and dict_table_t::referenced_set. It returns +true if the first argument is considered to go before the second in the +strict weak ordering it defines, and false otherwise. */ +struct dict_foreign_compare { + + bool operator()( + const dict_foreign_t* lhs, + const dict_foreign_t* rhs) const + { + return(ut_strcmp(lhs->id, rhs->id) < 0); + } +}; + +/** A function object to find a foreign key with the given index as the +referenced index. Return the foreign key with matching criteria or NULL */ +struct dict_foreign_with_index { + + dict_foreign_with_index(const dict_index_t* index) + : m_index(index) + {} + + bool operator()(const dict_foreign_t* foreign) const + { + return(foreign->referenced_index == m_index); + } + + const dict_index_t* m_index; +}; + +/* A function object to check if the foreign constraint is between different +tables. Returns true if foreign key constraint is between different tables, +false otherwise. */ +struct dict_foreign_different_tables { + + bool operator()(const dict_foreign_t* foreign) const + { + return(foreign->foreign_table != foreign->referenced_table); + } +}; + +/** A function object to check if the foreign key constraint has the same +name as given. If the full name of the foreign key constraint doesn't match, +then, check if removing the database name from the foreign key constraint +matches. Return true if it matches, false otherwise. */ +struct dict_foreign_matches_id { + + dict_foreign_matches_id(const char* id) + : m_id(id) + {} + + bool operator()(const dict_foreign_t* foreign) const + { + if (0 == innobase_strcasecmp(foreign->id, m_id)) { + return(true); + } + if (const char* pos = strchr(foreign->id, '/')) { + if (0 == innobase_strcasecmp(m_id, pos + 1)) { + return(true); + } + } + return(false); + } + + const char* m_id; +}; + +typedef std::set<dict_foreign_t*, dict_foreign_compare> dict_foreign_set; + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_set& fk_set); + +/** Function object to check if a foreign key object is there +in the given foreign key set or not. It returns true if the +foreign key is not found, false otherwise */ +struct dict_foreign_not_exists { + dict_foreign_not_exists(const dict_foreign_set& obj_) + : m_foreigns(obj_) + {} + + /* Return true if the given foreign key is not found */ + bool operator()(dict_foreign_t* const & foreign) const { + return(m_foreigns.find(foreign) == m_foreigns.end()); + } +private: + const dict_foreign_set& m_foreigns; +}; + +/** Validate the search order in the foreign key set. +@param[in] fk_set the foreign key set to be validated +@return true if search order is fine in the set, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_foreign_set& fk_set); + +/** Validate the search order in the foreign key sets of the table +(foreign_set and referenced_set). +@param[in] table table whose foreign key sets are to be validated +@return true if foreign key sets are fine, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_table_t& table); + +/*********************************************************************//** +Frees a foreign key struct. */ +inline +void +dict_foreign_free( +/*==============*/ + dict_foreign_t* foreign) /*!< in, own: foreign key struct */ +{ + mem_heap_free(foreign->heap); +} + +/** The destructor will free all the foreign key constraints in the set +by calling dict_foreign_free() on each of the foreign key constraints. +This is used to free the allocated memory when a local set goes out +of scope. */ +struct dict_foreign_set_free { + + dict_foreign_set_free(const dict_foreign_set& foreign_set) + : m_foreign_set(foreign_set) + {} + + ~dict_foreign_set_free() + { + std::for_each(m_foreign_set.begin(), + m_foreign_set.end(), + dict_foreign_free); + } + + const dict_foreign_set& m_foreign_set; +}; + +/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that +a foreign key constraint is enforced, therefore RESTRICT just means no flag */ +/* @{ */ +#define DICT_FOREIGN_ON_DELETE_CASCADE 1 /*!< ON DELETE CASCADE */ +#define DICT_FOREIGN_ON_DELETE_SET_NULL 2 /*!< ON UPDATE SET NULL */ +#define DICT_FOREIGN_ON_UPDATE_CASCADE 4 /*!< ON DELETE CASCADE */ +#define DICT_FOREIGN_ON_UPDATE_SET_NULL 8 /*!< ON UPDATE SET NULL */ +#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16 /*!< ON DELETE NO ACTION */ +#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32 /*!< ON UPDATE NO ACTION */ +/* @} */ + +/* This flag is for sync SQL DDL and memcached DML. +if table->memcached_sync_count == DICT_TABLE_IN_DDL means there's DDL running on +the table, DML from memcached will be blocked. */ +#define DICT_TABLE_IN_DDL -1 + +/** Data structure for a database table. Most fields will be +initialized to 0, NULL or FALSE in dict_mem_table_create(). */ +struct dict_table_t{ + + + table_id_t id; /*!< id of the table */ + mem_heap_t* heap; /*!< memory heap */ + char* name; /*!< table name */ + const char* dir_path_of_temp_table;/*!< NULL or the directory path + where a TEMPORARY table that was explicitly + created by a user should be placed if + innodb_file_per_table is defined in my.cnf; + in Unix this is usually /tmp/..., in Windows + temp\... */ + char* data_dir_path; /*!< NULL or the directory path + specified by DATA DIRECTORY */ + unsigned space:32; + /*!< space where the clustered index of the + table is placed */ + unsigned flags:DICT_TF_BITS; /*!< DICT_TF_... */ + unsigned flags2:DICT_TF2_BITS; /*!< DICT_TF2_... */ + unsigned ibd_file_missing:1; + /*!< TRUE if this is in a single-table + tablespace and the .ibd file is missing; then + we must return in ha_innodb.cc an error if the + user tries to query such an orphaned table */ + unsigned cached:1;/*!< TRUE if the table object has been added + to the dictionary cache */ + unsigned to_be_dropped:1; + /*!< TRUE if the table is to be dropped, but + not yet actually dropped (could in the bk + drop list); It is turned on at the beginning + of row_drop_table_for_mysql() and turned off + just before we start to update system tables + for the drop. It is protected by + dict_operation_lock */ + unsigned n_def:10;/*!< number of columns defined so far */ + unsigned n_cols:10;/*!< number of columns */ + unsigned can_be_evicted:1; + /*!< TRUE if it's not an InnoDB system table + or a table that has no FK relationships */ + unsigned corrupted:1; + /*!< TRUE if table is corrupted */ + unsigned drop_aborted:1; + /*!< TRUE if some indexes should be dropped + after ONLINE_INDEX_ABORTED + or ONLINE_INDEX_ABORTED_DROPPED */ + dict_col_t* cols; /*!< array of column descriptions */ + const char* col_names; + /*!< Column names packed in a character string + "name1\0name2\0...nameN\0". Until + the string contains n_cols, it will be + allocated from a temporary heap. The final + string will be allocated from table->heap. */ +#ifndef UNIV_HOTBACKUP + hash_node_t name_hash; /*!< hash chain node */ + hash_node_t id_hash; /*!< hash chain node */ + UT_LIST_BASE_NODE_T(dict_index_t) + indexes; /*!< list of indexes of the table */ + + dict_foreign_set foreign_set; + /*!< set of foreign key constraints + in the table; these refer to columns + in other tables */ + + dict_foreign_set referenced_set; + /*!< list of foreign key constraints + which refer to this table */ + + UT_LIST_NODE_T(dict_table_t) + table_LRU; /*!< node of the LRU list of tables */ + unsigned fk_max_recusive_level:8; + /*!< maximum recursive level we support when + loading tables chained together with FK + constraints. If exceeds this level, we will + stop loading child table into memory along with + its parent table */ + ulint n_foreign_key_checks_running; + /*!< count of how many foreign key check + operations are currently being performed + on the table: we cannot drop the table while + there are foreign key checks running on + it! */ + trx_id_t def_trx_id; + /*!< transaction id that last touched + the table definition, either when + loading the definition or CREATE + TABLE, or ALTER TABLE (prepare, + commit, and rollback phases) */ + trx_id_t query_cache_inv_trx_id; + /*!< transactions whose trx id is + smaller than this number are not + allowed to store to the MySQL query + cache or retrieve from it; when a trx + with undo logs commits, it sets this + to the value of the trx id counter for + the tables it had an IX lock on */ +#ifdef UNIV_DEBUG + /*----------------------*/ + ibool does_not_fit_in_memory; + /*!< this field is used to specify in + simulations tables which are so big + that disk should be accessed: disk + access is simulated by putting the + thread to sleep for a while; NOTE that + this flag is not stored to the data + dictionary on disk, and the database + will forget about value TRUE if it has + to reload the table definition from + disk */ +#endif /* UNIV_DEBUG */ + /*----------------------*/ + unsigned big_rows:1; + /*!< flag: TRUE if the maximum length of + a single row exceeds BIG_ROW_SIZE; + initialized in dict_table_add_to_cache() */ + /** Statistics for query optimization */ + /* @{ */ + + volatile os_once::state_t stats_latch_created; + /*!< Creation state of 'stats_latch'. */ + + rw_lock_t* stats_latch; /*!< this latch protects: + dict_table_t::stat_initialized + dict_table_t::stat_n_rows (*) + dict_table_t::stat_clustered_index_size + dict_table_t::stat_sum_of_other_index_sizes + dict_table_t::stat_modified_counter (*) + dict_table_t::indexes*::stat_n_diff_key_vals[] + dict_table_t::indexes*::stat_index_size + dict_table_t::indexes*::stat_n_leaf_pages + (*) those are not always protected for + performance reasons */ + unsigned stat_initialized:1; /*!< TRUE if statistics have + been calculated the first time + after database startup or table creation */ +#define DICT_TABLE_IN_USED -1 + lint memcached_sync_count; + /*!< count of how many handles are opened + to this table from memcached; DDL on the + table is NOT allowed until this count + goes to zero. If it's -1, means there's DDL + on the table, DML from memcached will be + blocked. */ + ib_time_t stats_last_recalc; + /*!< Timestamp of last recalc of the stats */ + ib_uint32_t stat_persistent; + /*!< The two bits below are set in the + ::stat_persistent member and have the following + meaning: + 1. _ON=0, _OFF=0, no explicit persistent stats + setting for this table, the value of the global + srv_stats_persistent is used to determine + whether the table has persistent stats enabled + or not + 2. _ON=0, _OFF=1, persistent stats are + explicitly disabled for this table, regardless + of the value of the global srv_stats_persistent + 3. _ON=1, _OFF=0, persistent stats are + explicitly enabled for this table, regardless + of the value of the global srv_stats_persistent + 4. _ON=1, _OFF=1, not allowed, we assert if + this ever happens. */ +#define DICT_STATS_PERSISTENT_ON (1 << 1) +#define DICT_STATS_PERSISTENT_OFF (1 << 2) + ib_uint32_t stats_auto_recalc; + /*!< The two bits below are set in the + ::stats_auto_recalc member and have + the following meaning: + 1. _ON=0, _OFF=0, no explicit auto recalc + setting for this table, the value of the global + srv_stats_persistent_auto_recalc is used to + determine whether the table has auto recalc + enabled or not + 2. _ON=0, _OFF=1, auto recalc is explicitly + disabled for this table, regardless of the + value of the global + srv_stats_persistent_auto_recalc + 3. _ON=1, _OFF=0, auto recalc is explicitly + enabled for this table, regardless of the + value of the global + srv_stats_persistent_auto_recalc + 4. _ON=1, _OFF=1, not allowed, we assert if + this ever happens. */ +#define DICT_STATS_AUTO_RECALC_ON (1 << 1) +#define DICT_STATS_AUTO_RECALC_OFF (1 << 2) + ulint stats_sample_pages; + /*!< the number of pages to sample for this + table during persistent stats estimation; + if this is 0, then the value of the global + srv_stats_persistent_sample_pages will be + used instead. */ + ib_uint64_t stat_n_rows; + /*!< approximate number of rows in the table; + we periodically calculate new estimates */ + ulint stat_clustered_index_size; + /*!< approximate clustered index size in + database pages */ + ulint stat_sum_of_other_index_sizes; + /*!< other indexes in database pages */ + ib_uint64_t stat_modified_counter; + /*!< when a row is inserted, updated, + or deleted, + we add 1 to this number; we calculate new + estimates for the stat_... values for the + table and the indexes when about 1 / 16 of + table has been modified; + also when the estimate operation is + called for MySQL SHOW TABLE STATUS; the + counter is reset to zero at statistics + calculation; this counter is not protected by + any latch, because this is only used for + heuristics */ +#define BG_STAT_NONE 0 +#define BG_STAT_IN_PROGRESS (1 << 0) + /*!< BG_STAT_IN_PROGRESS is set in + stats_bg_flag when the background + stats code is working on this table. The DROP + TABLE code waits for this to be cleared + before proceeding. */ +#define BG_STAT_SHOULD_QUIT (1 << 1) + /*!< BG_STAT_SHOULD_QUIT is set in + stats_bg_flag when DROP TABLE starts + waiting on BG_STAT_IN_PROGRESS to be cleared, + the background stats thread will detect this + and will eventually quit sooner */ + byte stats_bg_flag; + /*!< see BG_STAT_* above. + Writes are covered by dict_sys->mutex. + Dirty reads are possible. */ + /* @} */ + /*----------------------*/ + /**!< The following fields are used by the + AUTOINC code. The actual collection of + tables locked during AUTOINC read/write is + kept in trx_t. In order to quickly determine + whether a transaction has locked the AUTOINC + lock we keep a pointer to the transaction + here in the autoinc_trx variable. This is to + avoid acquiring the lock_sys_t::mutex and + scanning the vector in trx_t. + + When an AUTOINC lock has to wait, the + corresponding lock instance is created on + the trx lock heap rather than use the + pre-allocated instance in autoinc_lock below.*/ + /* @{ */ + lock_t* autoinc_lock; + /*!< a buffer for an AUTOINC lock + for this table: we allocate the memory here + so that individual transactions can get it + and release it without a need to allocate + space from the lock heap of the trx: + otherwise the lock heap would grow rapidly + if we do a large insert from a select */ + ib_mutex_t autoinc_mutex; + /*!< mutex protecting the autoincrement + counter */ + ib_uint64_t autoinc;/*!< autoinc counter value to give to the + next inserted row */ + ulong n_waiting_or_granted_auto_inc_locks; + /*!< This counter is used to track the number + of granted and pending autoinc locks on this + table. This value is set after acquiring the + lock_sys_t::mutex but we peek the contents to + determine whether other transactions have + acquired the AUTOINC lock or not. Of course + only one transaction can be granted the + lock but there can be multiple waiters. */ + const trx_t* autoinc_trx; + /*!< The transaction that currently holds the + the AUTOINC lock on this table. + Protected by lock_sys->mutex. */ + fts_t* fts; /* FTS specific state variables */ + /* @} */ + /*----------------------*/ + + ib_quiesce_t quiesce;/*!< Quiescing states, protected by the + dict_index_t::lock. ie. we can only change + the state if we acquire all the latches + (dict_index_t::lock) in X mode of this table's + indexes. */ + + /*----------------------*/ + ulint n_rec_locks; + /*!< Count of the number of record locks on + this table. We use this to determine whether + we can evict the table from the dictionary + cache. It is protected by lock_sys->mutex. */ + ulint n_ref_count; + /*!< count of how many handles are opened + to this table; dropping of the table is + NOT allowed until this count gets to zero; + MySQL does NOT itself check the number of + open handles at drop */ + UT_LIST_BASE_NODE_T(lock_t) + locks; /*!< list of locks on the table; protected + by lock_sys->mutex */ +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG + ulint magic_n;/*!< magic number */ +/** Value of dict_table_t::magic_n */ +# define DICT_TABLE_MAGIC_N 76333786 +#endif /* UNIV_DEBUG */ +}; + +/** A function object to add the foreign key constraint to the referenced set +of the referenced table, if it exists in the dictionary cache. */ +struct dict_foreign_add_to_referenced_table { + void operator()(dict_foreign_t* foreign) const + { + if (dict_table_t* table = foreign->referenced_table) { + std::pair<dict_foreign_set::iterator, bool> ret + = table->referenced_set.insert(foreign); + ut_a(ret.second); + } + } +}; + +#ifndef UNIV_NONINL +#include "dict0mem.ic" +#endif + +#endif diff --git a/storage/innobase/include/dict0mem.ic b/storage/innobase/include/dict0mem.ic new file mode 100644 index 00000000000..38d51f61789 --- /dev/null +++ b/storage/innobase/include/dict0mem.ic @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0mem.ic +Data dictionary memory object creation + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include "data0type.h" +#include "dict0mem.h" +#include "fil0fil.h" + +/**********************************************************************//** +This function poplulates a dict_index_t index memory structure with +supplied information. */ +UNIV_INLINE +void +dict_mem_fill_index_struct( +/*=======================*/ + dict_index_t* index, /*!< out: index to be filled */ + mem_heap_t* heap, /*!< in: memory heap */ + const char* table_name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + ulint space, /*!< in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields) /*!< in: number of fields */ +{ + + if (heap) { + index->heap = heap; + index->name = mem_heap_strdup(heap, index_name); + index->fields = (dict_field_t*) mem_heap_alloc( + heap, 1 + n_fields * sizeof(dict_field_t)); + } else { + index->name = index_name; + index->heap = NULL; + index->fields = NULL; + } + + /* Assign a ulint to a 4-bit-mapped field. + Only the low-order 4 bits are assigned. */ + index->type = type; +#ifndef UNIV_HOTBACKUP + index->space = (unsigned int) space; + index->page = FIL_NULL; +#endif /* !UNIV_HOTBACKUP */ + index->table_name = table_name; + index->n_fields = (unsigned int) n_fields; + /* The '1 +' above prevents allocation + of an empty mem block */ +#ifdef UNIV_DEBUG + index->magic_n = DICT_INDEX_MAGIC_N; +#endif /* UNIV_DEBUG */ +} diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h new file mode 100644 index 00000000000..9a3c8e22992 --- /dev/null +++ b/storage/innobase/include/dict0priv.h @@ -0,0 +1,63 @@ +/***************************************************************************** + +Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0priv.h +Data dictionary private functions + +Created Fri 2 Jul 2010 13:30:38 EST - Sunny Bains +*******************************************************/ + +#ifndef dict0priv_h +#define dict0priv_h + +/**********************************************************************//** +Gets a table; loads it to the dictionary cache if necessary. A low-level +function. Note: Not to be called from outside dict0*c functions. +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_get_low( +/*===============*/ + const char* table_name); /*!< in: table name */ + +/**********************************************************************//** +Checks if a table is in the dictionary cache. +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_check_if_in_cache_low( +/*=============================*/ + const char* table_name); /*!< in: table name */ + +/**********************************************************************//** +Returns a table object based on table id. +@return table, NULL if does not exist */ +UNIV_INLINE +dict_table_t* +dict_table_open_on_id_low( +/*=====================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err); /*!< in: errors to ignore + when loading the table */ + +#ifndef UNIV_NONINL +#include "dict0priv.ic" +#endif + +#endif /* dict0priv.h */ diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic new file mode 100644 index 00000000000..30ba8fb60aa --- /dev/null +++ b/storage/innobase/include/dict0priv.ic @@ -0,0 +1,125 @@ +/***************************************************************************** + +Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0priv.ic +Data dictionary system private include file + +Created Wed 13 Oct 2010 16:10:14 EST Sunny Bains +***********************************************************************/ + +#include "dict0dict.h" +#include "dict0load.h" +#include "dict0priv.h" +#ifndef UNIV_HOTBACKUP + +/**********************************************************************//** +Gets a table; loads it to the dictionary cache if necessary. A low-level +function. +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_get_low( +/*===============*/ + const char* table_name) /*!< in: table name */ +{ + dict_table_t* table; + + ut_ad(table_name); + ut_ad(mutex_own(&(dict_sys->mutex))); + + table = dict_table_check_if_in_cache_low(table_name); + + if (table && table->corrupted) { + fprintf(stderr, "InnoDB: table"); + ut_print_name(stderr, NULL, TRUE, table->name); + if (srv_load_corrupted) { + fputs(" is corrupted, but" + " innodb_force_load_corrupted is set\n", stderr); + } else { + fputs(" is corrupted\n", stderr); + return(NULL); + } + } + + if (table == NULL) { + table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE); + } + + ut_ad(!table || table->cached); + + return(table); +} + +/**********************************************************************//** +Returns a table object based on table id. +@return table, NULL if does not exist */ +UNIV_INLINE +dict_table_t* +dict_table_open_on_id_low( +/*======================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err) /*!< in: errors to ignore + when loading the table */ +{ + dict_table_t* table; + ulint fold; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* Look for the table name in the hash table */ + fold = ut_fold_ull(table_id); + + HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold, + dict_table_t*, table, ut_ad(table->cached), + table->id == table_id); + if (table == NULL) { + table = dict_load_table_on_id(table_id, ignore_err); + } + + ut_ad(!table || table->cached); + + /* TODO: should get the type information from MySQL */ + + return(table); +} + +/**********************************************************************//** +Checks if a table is in the dictionary cache. +@return table, NULL if not found */ +UNIV_INLINE +dict_table_t* +dict_table_check_if_in_cache_low( +/*=============================*/ + const char* table_name) /*!< in: table name */ +{ + dict_table_t* table; + ulint table_fold; + + ut_ad(table_name); + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* Look for the table name in the hash table */ + table_fold = ut_fold_string(table_name); + + HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold, + dict_table_t*, table, ut_ad(table->cached), + !strcmp(table->name, table_name)); + return(table); +} +#endif /*! UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h new file mode 100644 index 00000000000..186f90e3694 --- /dev/null +++ b/storage/innobase/include/dict0stats.h @@ -0,0 +1,202 @@ +/***************************************************************************** + +Copyright (c) 2009, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats.h +Code used for calculating and manipulating table statistics. + +Created Jan 06, 2010 Vasil Dimov +*******************************************************/ + +#ifndef dict0stats_h +#define dict0stats_h + +#include "univ.i" + +#include "db0err.h" +#include "dict0types.h" +#include "trx0types.h" + +enum dict_stats_upd_option_t { + DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the + statistics using a precise and slow + algo and save them to the persistent + storage, if the persistent storage is + not present then emit a warning and + fall back to transient stats */ + DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics + using an imprecise quick algo + without saving the results + persistently */ + DICT_STATS_EMPTY_TABLE, /* Write all zeros (or 1 where it makes sense) + into a table and its indexes' statistics + members. The resulting stats correspond to an + empty table. If the table is using persistent + statistics, then they are saved on disk. */ + DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats + from the persistent storage if the in-memory + structures have not been initialized yet, + otherwise do nothing */ +}; + +/*********************************************************************//** +Calculates new estimates for table and index statistics. This function +is relatively quick and is used to calculate transient statistics that +are not saved on disk. +This was the only way to calculate statistics before the +Persistent Statistics feature was introduced. */ +UNIV_INTERN +void +dict_stats_update_transient( +/*========================*/ + dict_table_t* table); /*!< in/out: table */ + +/*********************************************************************//** +Set the persistent statistics flag for a given table. This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ + __attribute__((nonnull)); + +/*********************************************************************//** +Check whether persistent statistics is enabled for a given table. +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_is_persistent_enabled( +/*=============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off); /*!< in: explicitly disabled */ + +/*********************************************************************//** +Check whether auto recalc is enabled for a given table. +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_auto_recalc_is_enabled( +/*==============================*/ + const dict_table_t* table); /*!< in: table */ + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table); /*!< in/out: table */ + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. */ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((nonnull)); + +/*********************************************************************//** +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. +@return DB_* error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +dict_stats_update( +/*==============*/ + dict_table_t* table, /*!< in/out: table */ + dict_stats_upd_option_t stats_upd_option); + /*!< in: whether to (re) calc + the stats or to fetch them from + the persistent storage */ + +/*********************************************************************//** +Removes the information for a particular index's stats from the persistent +storage if it exists and if there is data stored for this index. +This function creates its own trx and commits it. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_drop_index( +/*==================*/ + const char* tname, /*!< in: table name */ + const char* iname, /*!< in: index name */ + char* errstr, /*!< out: error message if != DB_SUCCESS + is returned */ + ulint errstr_sz);/*!< in: size of the errstr buffer */ + +/*********************************************************************//** +Removes the statistics for a table and all of its indexes from the +persistent storage if it exists and if there is data stored for the table. +This function creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_drop_table( +/*==================*/ + const char* table_name, /*!< in: table name */ + char* errstr, /*!< out: error message + if != DB_SUCCESS is returned */ + ulint errstr_sz); /*!< in: size of errstr buffer */ + +/*********************************************************************//** +Fetches or calculates new estimates for index statistics. */ +UNIV_INTERN +void +dict_stats_update_for_index( +/*========================*/ + dict_index_t* index) /*!< in/out: index */ + __attribute__((nonnull)); + +/*********************************************************************//** +Renames a table in InnoDB persistent stats storage. +This function creates its own transaction and commits it. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_rename_table( +/*====================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + char* errstr, /*!< out: error string if != DB_SUCCESS + is returned */ + size_t errstr_sz); /*!< in: errstr size */ + +#ifndef UNIV_NONINL +#include "dict0stats.ic" +#endif + +#endif /* dict0stats_h */ diff --git a/storage/innobase/include/dict0stats.ic b/storage/innobase/include/dict0stats.ic new file mode 100644 index 00000000000..ec9a9065470 --- /dev/null +++ b/storage/innobase/include/dict0stats.ic @@ -0,0 +1,236 @@ +/***************************************************************************** + +Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats.ic +Code used for calculating and manipulating table statistics. + +Created Jan 23, 2012 Vasil Dimov +*******************************************************/ + +#include "univ.i" +#include "dict0dict.h" /* dict_table_stats_lock() */ +#include "dict0types.h" /* dict_table_t */ +#include "srv0srv.h" /* srv_stats_persistent, srv_stats_auto_recalc */ + +/*********************************************************************//** +Set the persistent statistics flag for a given table. This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ +{ + /* Not allowed to have both flags set, but a CREATE or ALTER + statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would + end up having both set. In this case we clear the OFF flag. */ + if (ps_on && ps_off) { + ps_off = FALSE; + } + + ib_uint32_t stat_persistent = 0; + + if (ps_on) { + stat_persistent |= DICT_STATS_PERSISTENT_ON; + } + + if (ps_off) { + stat_persistent |= DICT_STATS_PERSISTENT_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stat_persistent = stat_persistent; +} + +/*********************************************************************//** +Check whether persistent statistics is enabled for a given table. +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_is_persistent_enabled( +/*=============================*/ + const dict_table_t* table) /*!< in: table */ +{ + /* Because of the nature of this check (non-locking) it is possible + that a table becomes: + * PS-disabled immediately after this function has returned TRUE or + * PS-enabled immediately after this function has returned FALSE. + This means that it is possible that we do: + + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has + just been PS-disabled or + + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has + just been PS-enabled. + This is acceptable. Avoiding this would mean that we would have to + protect the ::stat_persistent with dict_table_stats_lock() like the + other ::stat_ members which would be too big performance penalty, + especially when this function is called from + row_update_statistics_if_needed(). */ + + /* we rely on this read to be atomic */ + ib_uint32_t stat_persistent = table->stat_persistent; + + if (stat_persistent & DICT_STATS_PERSISTENT_ON) { + ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF)); + return(TRUE); + } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) { + return(FALSE); + } else { + return(srv_stats_persistent); + } +} + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off) /*!< in: explicitly disabled */ +{ + ut_ad(!auto_recalc_on || !auto_recalc_off); + + ib_uint32_t stats_auto_recalc = 0; + + if (auto_recalc_on) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON; + } + + if (auto_recalc_off) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stats_auto_recalc = stats_auto_recalc; +} + +/*********************************************************************//** +Check whether auto recalc is enabled for a given table. +@return TRUE if enabled, FALSE otherwise */ +UNIV_INLINE +ibool +dict_stats_auto_recalc_is_enabled( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + /* we rely on this read to be atomic */ + ib_uint32_t stats_auto_recalc = table->stats_auto_recalc; + + if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) { + ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF)); + return(TRUE); + } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) { + return(FALSE); + } else { + return(srv_stats_auto_recalc); + } +} + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(!mutex_own(&dict_sys->mutex)); + + if (table->stat_initialized) { + return; + } + + dict_stats_upd_option_t opt; + + if (dict_stats_is_persistent_enabled(table)) { + opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; + } else { + opt = DICT_STATS_RECALC_TRANSIENT; + } + + dict_stats_update(table, opt); +} + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. */ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(mutex_own(&dict_sys->mutex)); + + ut_a(table->n_ref_count == 0); + + dict_table_stats_lock(table, RW_X_LATCH); + + if (!table->stat_initialized) { + dict_table_stats_unlock(table, RW_X_LATCH); + return; + } + + table->stat_initialized = FALSE; + +#ifdef UNIV_DEBUG_VALGRIND + UNIV_MEM_INVALID(&table->stat_n_rows, + sizeof(table->stat_n_rows)); + UNIV_MEM_INVALID(&table->stat_clustered_index_size, + sizeof(table->stat_clustered_index_size)); + UNIV_MEM_INVALID(&table->stat_sum_of_other_index_sizes, + sizeof(table->stat_sum_of_other_index_sizes)); + UNIV_MEM_INVALID(&table->stat_modified_counter, + sizeof(table->stat_modified_counter)); + + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + ulint n_uniq = dict_index_get_n_unique(index); + + UNIV_MEM_INVALID( + index->stat_n_diff_key_vals, + n_uniq * sizeof(index->stat_n_diff_key_vals[0])); + UNIV_MEM_INVALID( + index->stat_n_sample_sizes, + n_uniq * sizeof(index->stat_n_sample_sizes[0])); + UNIV_MEM_INVALID( + index->stat_n_non_null_key_vals, + n_uniq * sizeof(index->stat_n_non_null_key_vals[0])); + UNIV_MEM_INVALID( + &index->stat_index_size, + sizeof(index->stat_index_size)); + UNIV_MEM_INVALID( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + dict_table_stats_unlock(table, RW_X_LATCH); +} diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h new file mode 100644 index 00000000000..e866ab419fe --- /dev/null +++ b/storage/innobase/include/dict0stats_bg.h @@ -0,0 +1,127 @@ +/***************************************************************************** + +Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats_bg.h +Code used for background table and index stats gathering. + +Created Apr 26, 2012 Vasil Dimov +*******************************************************/ + +#ifndef dict0stats_bg_h +#define dict0stats_bg_h + +#include "univ.i" + +#include "dict0types.h" /* dict_table_t, table_id_t */ +#include "os0sync.h" /* os_event_t */ +#include "os0thread.h" /* DECLARE_THREAD */ + +/** Event to wake up the stats thread */ +extern os_event_t dict_stats_event; + +/*****************************************************************//** +Add a table to the recalc pool, which is processed by the +background stats gathering thread. Only the table id is added to the +list, so the table can be closed after being enqueued and it will be +opened when needed. If the table does not exist later (has been DROPped), +then it will be removed from the pool and skipped. */ +UNIV_INTERN +void +dict_stats_recalc_pool_add( +/*=======================*/ + const dict_table_t* table); /*!< in: table to add */ + +/*****************************************************************//** +Delete a given table from the auto recalc pool. +dict_stats_recalc_pool_del() */ +UNIV_INTERN +void +dict_stats_recalc_pool_del( +/*=======================*/ + const dict_table_t* table); /*!< in: table to remove */ + +/** Yield the data dictionary latch when waiting +for the background thread to stop accessing a table. +@param trx transaction holding the data dictionary locks */ +#define DICT_STATS_BG_YIELD(trx) do { \ + row_mysql_unlock_data_dictionary(trx); \ + os_thread_sleep(250000); \ + row_mysql_lock_data_dictionary(trx); \ +} while (0) + +/*****************************************************************//** +Request the background collection of statistics to stop for a table. +@retval true when no background process is active +@retval false when it is not safe to modify the table definition */ +UNIV_INLINE +bool +dict_stats_stop_bg( +/*===============*/ + dict_table_t* table) /*!< in/out: table */ + __attribute__((warn_unused_result)); + +/*****************************************************************//** +Wait until background stats thread has stopped using the specified table. +The caller must have locked the data dictionary using +row_mysql_lock_data_dictionary() and this function may unlock it temporarily +and restore the lock before it exits. +The background stats thread is guaranteed not to start using the specified +table after this function returns and before the caller unlocks the data +dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag +under dict_sys->mutex. */ +UNIV_INTERN +void +dict_stats_wait_bg_to_stop_using_table( +/*===================================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx); /*!< in/out: transaction to use for + unlocking/locking the data dict */ +/*****************************************************************//** +Initialize global variables needed for the operation of dict_stats_thread(). +Must be called before dict_stats_thread() is started. */ +UNIV_INTERN +void +dict_stats_thread_init(); +/*====================*/ + +/*****************************************************************//** +Free resources allocated by dict_stats_thread_init(), must be called +after dict_stats_thread() has exited. */ +UNIV_INTERN +void +dict_stats_thread_deinit(); +/*======================*/ + +/*****************************************************************//** +This is the thread for background stats gathering. It pops tables, from +the auto recalc list and proceeds them, eventually recalculating their +statistics. +@return this function does not return, it calls os_thread_exit() */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(dict_stats_thread)( +/*==============================*/ + void* arg); /*!< in: a dummy parameter + required by os_thread_create */ + +# ifndef UNIV_NONINL +# include "dict0stats_bg.ic" +# endif + +#endif /* dict0stats_bg_h */ diff --git a/storage/innobase/include/dict0stats_bg.ic b/storage/innobase/include/dict0stats_bg.ic new file mode 100644 index 00000000000..87e3225de58 --- /dev/null +++ b/storage/innobase/include/dict0stats_bg.ic @@ -0,0 +1,45 @@ +/***************************************************************************** + +Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats_bg.ic +Code used for background table and index stats gathering. + +Created Feb 8, 2013 Marko Makela +*******************************************************/ + +/*****************************************************************//** +Request the background collection of statistics to stop for a table. +@retval true when no background process is active +@retval false when it is not safe to modify the table definition */ +UNIV_INLINE +bool +dict_stats_stop_bg( +/*===============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + + if (!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)) { + return(true); + } + + table->stats_bg_flag |= BG_STAT_SHOULD_QUIT; + return(false); +} diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h new file mode 100644 index 00000000000..d34b6f7eab3 --- /dev/null +++ b/storage/innobase/include/dict0types.h @@ -0,0 +1,91 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0types.h +Data dictionary global types + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0types_h +#define dict0types_h + +struct dict_sys_t; +struct dict_col_t; +struct dict_field_t; +struct dict_index_t; +struct dict_table_t; +struct dict_foreign_t; + +struct ind_node_t; +struct tab_node_t; + +/* Space id and page no where the dictionary header resides */ +#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ +#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO + +/* The ibuf table and indexes's ID are assigned as the number +DICT_IBUF_ID_MIN plus the space id */ +#define DICT_IBUF_ID_MIN 0xFFFFFFFF00000000ULL + +typedef ib_id_t table_id_t; +typedef ib_id_t index_id_t; + +/** Error to ignore when we load table dictionary into memory. However, +the table and index will be marked as "corrupted", and caller will +be responsible to deal with corrupted table or index. +Note: please define the IGNORE_ERR_* as bits, so their value can +be or-ed together */ +enum dict_err_ignore_t { + DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */ + DICT_ERR_IGNORE_INDEX_ROOT = 1, /*!< ignore error if index root + page is FIL_NULL or incorrect value */ + DICT_ERR_IGNORE_CORRUPT = 2, /*!< skip corrupted indexes */ + DICT_ERR_IGNORE_FK_NOKEY = 4, /*!< ignore error if any foreign + key is missing */ + DICT_ERR_IGNORE_RECOVER_LOCK = 8, + /*!< Used when recovering table locks + for resurrected transactions. + Silently load a missing + tablespace, and do not load + incomplete index definitions. */ + DICT_ERR_IGNORE_ALL = 0xFFFF /*!< ignore all errors */ +}; + +/** Quiescing states for flushing tables to disk. */ +enum ib_quiesce_t { + QUIESCE_NONE, + QUIESCE_START, /*!< Initialise, prepare to start */ + QUIESCE_COMPLETE /*!< All done */ +}; + +/** Prefix for tmp tables, adopted from sql/table.h */ +#define tmp_file_prefix "#sql" +#define tmp_file_prefix_length 4 +#define TEMP_FILE_PREFIX_INNODB "#sql-ib" + +#define TEMP_TABLE_PREFIX "#sql" +#define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/** Flag to control insert buffer debugging. */ +extern uint ibuf_debug; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +#endif diff --git a/storage/innobase/include/dyn0dyn.h b/storage/innobase/include/dyn0dyn.h new file mode 100644 index 00000000000..7f23302d1ff --- /dev/null +++ b/storage/innobase/include/dyn0dyn.h @@ -0,0 +1,199 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dyn0dyn.h +The dynamically allocated array + +Created 2/5/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dyn0dyn_h +#define dyn0dyn_h + +#include "univ.i" +#include "ut0lst.h" +#include "mem0mem.h" + +/** A block in a dynamically allocated array */ +struct dyn_block_t; +/** Dynamically allocated array */ +typedef dyn_block_t dyn_array_t; + +/** This is the initial 'payload' size of a dynamic array; +this must be > MLOG_BUF_MARGIN + 30! */ +#define DYN_ARRAY_DATA_SIZE 512 + +/*********************************************************************//** +Initializes a dynamic array. +@return initialized dyn array */ +UNIV_INLINE +dyn_array_t* +dyn_array_create( +/*=============*/ + dyn_array_t* arr) /*!< in/out memory buffer of + size sizeof(dyn_array_t) */ + __attribute__((nonnull)); +/************************************************************//** +Frees a dynamic array. */ +UNIV_INLINE +void +dyn_array_free( +/*===========*/ + dyn_array_t* arr) /*!< in,own: dyn array */ + __attribute__((nonnull)); +/*********************************************************************//** +Makes room on top of a dyn array and returns a pointer to a buffer in it. +After copying the elements, the caller must close the buffer using +dyn_array_close. +@return pointer to the buffer */ +UNIV_INLINE +byte* +dyn_array_open( +/*===========*/ + dyn_array_t* arr, /*!< in: dynamic array */ + ulint size) /*!< in: size in bytes of the buffer; MUST be + smaller than DYN_ARRAY_DATA_SIZE! */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Closes the buffer returned by dyn_array_open. */ +UNIV_INLINE +void +dyn_array_close( +/*============*/ + dyn_array_t* arr, /*!< in: dynamic array */ + const byte* ptr) /*!< in: end of used space */ + __attribute__((nonnull)); +/*********************************************************************//** +Makes room on top of a dyn array and returns a pointer to +the added element. The caller must copy the element to +the pointer returned. +@return pointer to the element */ +UNIV_INLINE +void* +dyn_array_push( +/*===========*/ + dyn_array_t* arr, /*!< in/out: dynamic array */ + ulint size) /*!< in: size in bytes of the element */ + __attribute__((nonnull, warn_unused_result)); +/************************************************************//** +Returns pointer to an element in dyn array. +@return pointer to element */ +UNIV_INLINE +void* +dyn_array_get_element( +/*==================*/ + const dyn_array_t* arr, /*!< in: dyn array */ + ulint pos) /*!< in: position of element + in bytes from array start */ + __attribute__((nonnull, warn_unused_result)); +/************************************************************//** +Returns the size of stored data in a dyn array. +@return data size in bytes */ +UNIV_INLINE +ulint +dyn_array_get_data_size( +/*====================*/ + const dyn_array_t* arr) /*!< in: dyn array */ + __attribute__((nonnull, warn_unused_result, pure)); +/************************************************************//** +Gets the first block in a dyn array. +@param arr dyn array +@return first block */ +#define dyn_array_get_first_block(arr) (arr) +/************************************************************//** +Gets the last block in a dyn array. +@param arr dyn array +@return last block */ +#define dyn_array_get_last_block(arr) \ + ((arr)->heap ? UT_LIST_GET_LAST((arr)->base) : (arr)) +/********************************************************************//** +Gets the next block in a dyn array. +@param arr dyn array +@param block dyn array block +@return pointer to next, NULL if end of list */ +#define dyn_array_get_next_block(arr, block) \ + ((arr)->heap ? UT_LIST_GET_NEXT(list, block) : NULL) +/********************************************************************//** +Gets the previous block in a dyn array. +@param arr dyn array +@param block dyn array block +@return pointer to previous, NULL if end of list */ +#define dyn_array_get_prev_block(arr, block) \ + ((arr)->heap ? UT_LIST_GET_PREV(list, block) : NULL) +/********************************************************************//** +Gets the number of used bytes in a dyn array block. +@return number of bytes used */ +UNIV_INLINE +ulint +dyn_block_get_used( +/*===============*/ + const dyn_block_t* block) /*!< in: dyn array block */ + __attribute__((nonnull, warn_unused_result, pure)); +/********************************************************************//** +Gets pointer to the start of data in a dyn array block. +@return pointer to data */ +UNIV_INLINE +byte* +dyn_block_get_data( +/*===============*/ + const dyn_block_t* block) /*!< in: dyn array block */ + __attribute__((nonnull, warn_unused_result, pure)); +/********************************************************//** +Pushes n bytes to a dyn array. */ +UNIV_INLINE +void +dyn_push_string( +/*============*/ + dyn_array_t* arr, /*!< in/out: dyn array */ + const byte* str, /*!< in: string to write */ + ulint len) /*!< in: string length */ + __attribute__((nonnull)); + +/*#################################################################*/ + +/** @brief A block in a dynamically allocated array. +NOTE! Do not access the fields of the struct directly: the definition +appears here only for the compiler to know its size! */ +struct dyn_block_t{ + mem_heap_t* heap; /*!< in the first block this is != NULL + if dynamic allocation has been needed */ + ulint used; /*!< number of data bytes used in this block; + DYN_BLOCK_FULL_FLAG is set when the block + becomes full */ + byte data[DYN_ARRAY_DATA_SIZE]; + /*!< storage for array elements */ + UT_LIST_BASE_NODE_T(dyn_block_t) base; + /*!< linear list of dyn blocks: this node is + used only in the first block */ + UT_LIST_NODE_T(dyn_block_t) list; + /*!< linear list node: used in all blocks */ +#ifdef UNIV_DEBUG + ulint buf_end;/*!< only in the debug version: if dyn + array is opened, this is the buffer + end offset, else this is 0 */ + ulint magic_n;/*!< magic number (DYN_BLOCK_MAGIC_N) */ +#endif +}; + + +#ifndef UNIV_NONINL +#include "dyn0dyn.ic" +#endif + +#endif diff --git a/storage/innobase/include/dyn0dyn.ic b/storage/innobase/include/dyn0dyn.ic new file mode 100644 index 00000000000..0296554e2ee --- /dev/null +++ b/storage/innobase/include/dyn0dyn.ic @@ -0,0 +1,306 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dyn0dyn.ic +The dynamically allocated array + +Created 2/5/1996 Heikki Tuuri +*******************************************************/ + +/** Value of dyn_block_t::magic_n */ +#define DYN_BLOCK_MAGIC_N 375767 +/** Flag for dyn_block_t::used that indicates a full block */ +#define DYN_BLOCK_FULL_FLAG 0x1000000UL + +/************************************************************//** +Adds a new block to a dyn array. +@return created block */ +UNIV_INTERN +dyn_block_t* +dyn_array_add_block( +/*================*/ + dyn_array_t* arr) /*!< in/out: dyn array */ + __attribute__((nonnull, warn_unused_result)); + +/********************************************************************//** +Gets the number of used bytes in a dyn array block. +@return number of bytes used */ +UNIV_INLINE +ulint +dyn_block_get_used( +/*===============*/ + const dyn_block_t* block) /*!< in: dyn array block */ +{ + ut_ad(block); + + return((block->used) & ~DYN_BLOCK_FULL_FLAG); +} + +/********************************************************************//** +Gets pointer to the start of data in a dyn array block. +@return pointer to data */ +UNIV_INLINE +byte* +dyn_block_get_data( +/*===============*/ + const dyn_block_t* block) /*!< in: dyn array block */ +{ + ut_ad(block); + + return(const_cast<byte*>(block->data)); +} + +/*********************************************************************//** +Initializes a dynamic array. +@return initialized dyn array */ +UNIV_INLINE +dyn_array_t* +dyn_array_create( +/*=============*/ + dyn_array_t* arr) /*!< in/out: memory buffer of + size sizeof(dyn_array_t) */ +{ + ut_ad(arr); +#if DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG +# error "DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG" +#endif + + arr->heap = NULL; + arr->used = 0; + + ut_d(arr->buf_end = 0); + ut_d(arr->magic_n = DYN_BLOCK_MAGIC_N); + + return(arr); +} + +/************************************************************//** +Frees a dynamic array. */ +UNIV_INLINE +void +dyn_array_free( +/*===========*/ + dyn_array_t* arr) /*!< in: dyn array */ +{ + if (arr->heap != NULL) { + mem_heap_free(arr->heap); + } + + ut_d(arr->magic_n = 0); +} + +/*********************************************************************//** +Makes room on top of a dyn array and returns a pointer to the added element. +The caller must copy the element to the pointer returned. +@return pointer to the element */ +UNIV_INLINE +void* +dyn_array_push( +/*===========*/ + dyn_array_t* arr, /*!< in/out: dynamic array */ + ulint size) /*!< in: size in bytes of the element */ +{ + dyn_block_t* block; + ulint used; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + ut_ad(size <= DYN_ARRAY_DATA_SIZE); + ut_ad(size); + + block = arr; + + if (block->used + size > DYN_ARRAY_DATA_SIZE) { + /* Get the last array block */ + + block = dyn_array_get_last_block(arr); + + if (block->used + size > DYN_ARRAY_DATA_SIZE) { + block = dyn_array_add_block(arr); + } + } + + used = block->used; + + block->used = used + size; + ut_ad(block->used <= DYN_ARRAY_DATA_SIZE); + + return(block->data + used); +} + +/*********************************************************************//** +Makes room on top of a dyn array and returns a pointer to a buffer in it. +After copying the elements, the caller must close the buffer using +dyn_array_close. +@return pointer to the buffer */ +UNIV_INLINE +byte* +dyn_array_open( +/*===========*/ + dyn_array_t* arr, /*!< in: dynamic array */ + ulint size) /*!< in: size in bytes of the buffer; MUST be + smaller than DYN_ARRAY_DATA_SIZE! */ +{ + dyn_block_t* block; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + ut_ad(size <= DYN_ARRAY_DATA_SIZE); + ut_ad(size); + + block = arr; + + if (block->used + size > DYN_ARRAY_DATA_SIZE) { + /* Get the last array block */ + + block = dyn_array_get_last_block(arr); + + if (block->used + size > DYN_ARRAY_DATA_SIZE) { + block = dyn_array_add_block(arr); + ut_a(size <= DYN_ARRAY_DATA_SIZE); + } + } + + ut_ad(block->used <= DYN_ARRAY_DATA_SIZE); + ut_ad(arr->buf_end == 0); + ut_d(arr->buf_end = block->used + size); + + return(block->data + block->used); +} + +/*********************************************************************//** +Closes the buffer returned by dyn_array_open. */ +UNIV_INLINE +void +dyn_array_close( +/*============*/ + dyn_array_t* arr, /*!< in/out: dynamic array */ + const byte* ptr) /*!< in: end of used space */ +{ + dyn_block_t* block; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + block = dyn_array_get_last_block(arr); + + ut_ad(arr->buf_end + block->data >= ptr); + + block->used = ptr - block->data; + + ut_ad(block->used <= DYN_ARRAY_DATA_SIZE); + + ut_d(arr->buf_end = 0); +} + +/************************************************************//** +Returns pointer to an element in dyn array. +@return pointer to element */ +UNIV_INLINE +void* +dyn_array_get_element( +/*==================*/ + const dyn_array_t* arr, /*!< in: dyn array */ + ulint pos) /*!< in: position of element + in bytes from array start */ +{ + const dyn_block_t* block; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + /* Get the first array block */ + block = dyn_array_get_first_block(arr); + + if (arr->heap != NULL) { + for (;;) { + ulint used = dyn_block_get_used(block); + + if (pos < used) { + break; + } + + pos -= used; + block = UT_LIST_GET_NEXT(list, block); + ut_ad(block); + } + } + + ut_ad(block); + ut_ad(dyn_block_get_used(block) >= pos); + + return(const_cast<byte*>(block->data) + pos); +} + +/************************************************************//** +Returns the size of stored data in a dyn array. +@return data size in bytes */ +UNIV_INLINE +ulint +dyn_array_get_data_size( +/*====================*/ + const dyn_array_t* arr) /*!< in: dyn array */ +{ + const dyn_block_t* block; + ulint sum = 0; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + if (arr->heap == NULL) { + + return(arr->used); + } + + /* Get the first array block */ + block = dyn_array_get_first_block(arr); + + while (block != NULL) { + sum += dyn_block_get_used(block); + block = dyn_array_get_next_block(arr, block); + } + + return(sum); +} + +/********************************************************//** +Pushes n bytes to a dyn array. */ +UNIV_INLINE +void +dyn_push_string( +/*============*/ + dyn_array_t* arr, /*!< in/out: dyn array */ + const byte* str, /*!< in: string to write */ + ulint len) /*!< in: string length */ +{ + ulint n_copied; + + while (len > 0) { + if (len > DYN_ARRAY_DATA_SIZE) { + n_copied = DYN_ARRAY_DATA_SIZE; + } else { + n_copied = len; + } + + memcpy(dyn_array_push(arr, n_copied), str, n_copied); + + str += n_copied; + len -= n_copied; + } +} diff --git a/storage/innobase/include/eval0eval.h b/storage/innobase/include/eval0eval.h new file mode 100644 index 00000000000..e3b1e6c16b6 --- /dev/null +++ b/storage/innobase/include/eval0eval.h @@ -0,0 +1,114 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0eval.h +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#ifndef eval0eval_h +#define eval0eval_h + +#include "univ.i" +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/*****************************************************************//** +Free the buffer from global dynamic memory for a value of a que_node, +if it has been allocated in the above function. The freeing for pushed +column values is done in sel_col_prefetch_buf_free. */ +UNIV_INTERN +void +eval_node_free_val_buf( +/*===================*/ + que_node_t* node); /*!< in: query graph node */ +/*****************************************************************//** +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node); /*!< in: symbol table node */ +/*****************************************************************//** +Evaluates an expression. */ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node); /*!< in: expression */ +/*****************************************************************//** +Sets an integer value as the value of an expression node. */ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /*!< in: expression node */ + lint val); /*!< in: value to set */ +/*****************************************************************//** +Gets an integer value from an expression node. +@return integer value */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + que_node_t* node); /*!< in: expression node */ +/*****************************************************************//** +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /*!< in: query graph node */ + const byte* str, /*!< in: binary string */ + ulint len); /*!< in: string length or UNIV_SQL_NULL */ +/*****************************************************************//** +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /*!< in: node to copy to */ + que_node_t* node2); /*!< in: node to copy from */ +/*****************************************************************//** +Gets a iboolean value from a query node. +@return iboolean value */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + que_node_t* node); /*!< in: query graph node */ +/*****************************************************************//** +Evaluates a comparison node. +@return the result of the comparison */ +UNIV_INTERN +ibool +eval_cmp( +/*=====*/ + func_node_t* cmp_node); /*!< in: comparison node */ + + +#ifndef UNIV_NONINL +#include "eval0eval.ic" +#endif + +#endif diff --git a/storage/innobase/include/eval0eval.ic b/storage/innobase/include/eval0eval.ic new file mode 100644 index 00000000000..e4b1dd08017 --- /dev/null +++ b/storage/innobase/include/eval0eval.ic @@ -0,0 +1,255 @@ +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0eval.ic +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" +#include "rem0cmp.h" +#include "pars0grm.h" + +/*****************************************************************//** +Evaluates a function node. */ +UNIV_INTERN +void +eval_func( +/*======*/ + func_node_t* func_node); /*!< in: function node */ +/*****************************************************************//** +Allocate a buffer from global dynamic memory for a value of a que_node. +NOTE that this memory must be explicitly freed when the query graph is +freed. If the node already has allocated buffer, that buffer is freed +here. NOTE that this is the only function where dynamic memory should be +allocated for a query node val field. +@return pointer to allocated buffer */ +UNIV_INTERN +byte* +eval_node_alloc_val_buf( +/*====================*/ + que_node_t* node, /*!< in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size); /*!< in: buffer size */ + + +/*****************************************************************//** +Allocates a new buffer if needed. +@return pointer to buffer */ +UNIV_INLINE +byte* +eval_node_ensure_val_buf( +/*=====================*/ + que_node_t* node, /*!< in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size) /*!< in: buffer size */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + dfield_set_len(dfield, size); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + if (!data || que_node_get_val_buf_size(node) < size) { + + data = eval_node_alloc_val_buf(node, size); + } + + return(data); +} + +/*****************************************************************//** +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node) /*!< in: symbol table node */ +{ + + ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL); + + if (sym_node->indirection) { + /* The symbol table node is an alias for a variable or a + column */ + + dfield_copy_data(que_node_get_val(sym_node), + que_node_get_val(sym_node->indirection)); + } +} + +/*****************************************************************//** +Evaluates an expression. */ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node) /*!< in: expression */ +{ + if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) { + + eval_sym((sym_node_t*) exp_node); + + return; + } + + eval_func(static_cast<func_node_t*>(exp_node)); +} + +/*****************************************************************//** +Sets an integer value as the value of an expression node. */ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /*!< in: expression node */ + lint val) /*!< in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + if (data == NULL) { + data = eval_node_alloc_val_buf(node, 4); + } + + ut_ad(dfield_get_len(dfield) == 4); + + mach_write_to_4(data, (ulint) val); +} + +/*****************************************************************//** +Gets an integer non-SQL null value from an expression node. +@return integer value */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + que_node_t* node) /*!< in: expression node */ +{ + const byte* ptr; + dfield_t* dfield; + + dfield = que_node_get_val(node); + ptr = static_cast<byte*>(dfield_get_data(dfield)); + + ut_ad(dfield_get_len(dfield) == 4); + + return((int) mach_read_from_4(ptr)); +} + +/*****************************************************************//** +Gets a iboolean value from a query node. +@return iboolean value */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + que_node_t* node) /*!< in: query graph node */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + ut_ad(data != NULL); + + return(mach_read_from_1(data)); +} + +/*****************************************************************//** +Sets a iboolean value as the value of a function node. */ +UNIV_INLINE +void +eval_node_set_ibool_val( +/*====================*/ + func_node_t* func_node, /*!< in: function node */ + ibool val) /*!< in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(func_node); + + data = static_cast<byte*>(dfield_get_data(dfield)); + + if (data == NULL) { + /* Allocate 1 byte to hold the value */ + + data = eval_node_alloc_val_buf(func_node, 1); + } + + ut_ad(dfield_get_len(dfield) == 1); + + mach_write_to_1(data, val); +} + +/*****************************************************************//** +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /*!< in: query graph node */ + const byte* str, /*!< in: binary string */ + ulint len) /*!< in: string length or UNIV_SQL_NULL */ +{ + byte* data; + + if (len == UNIV_SQL_NULL) { + dfield_set_len(que_node_get_val(node), len); + + return; + } + + data = eval_node_ensure_val_buf(node, len); + + ut_memcpy(data, str, len); +} + +/*****************************************************************//** +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /*!< in: node to copy to */ + que_node_t* node2) /*!< in: node to copy from */ +{ + dfield_t* dfield2; + + dfield2 = que_node_get_val(node2); + + eval_node_copy_and_alloc_val( + node1, + static_cast<byte*>(dfield_get_data(dfield2)), + dfield_get_len(dfield2)); +} diff --git a/storage/innobase/include/eval0proc.h b/storage/innobase/include/eval0proc.h new file mode 100644 index 00000000000..7755fb10343 --- /dev/null +++ b/storage/innobase/include/eval0proc.h @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0proc.h +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#ifndef eval0proc_h +#define eval0proc_h + +#include "univ.i" +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/**********************************************************************//** +Performs an execution step of a procedure node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an if-statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +if_step( +/*====*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a while-statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +while_step( +/*=======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a for-loop node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +for_step( +/*=====*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an assignment statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +assign_step( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a procedure call node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an exit statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +exit_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a return-statement node. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +return_step( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ + + +#ifndef UNIV_NONINL +#include "eval0proc.ic" +#endif + +#endif diff --git a/storage/innobase/include/eval0proc.ic b/storage/innobase/include/eval0proc.ic new file mode 100644 index 00000000000..81418bae2c9 --- /dev/null +++ b/storage/innobase/include/eval0proc.ic @@ -0,0 +1,88 @@ +/***************************************************************************** + +Copyright (c) 1998, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0proc.ic +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#include "pars0pars.h" +#include "que0que.h" +#include "eval0eval.h" + +/**********************************************************************//** +Performs an execution step of a procedure node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + proc_node_t* node; + + ut_ad(thr); + + node = static_cast<proc_node_t*>(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_PROC); + + if (thr->prev_node == que_node_get_parent(node)) { + /* Start execution from the first statement in the statement + list */ + + thr->run_node = node->stat_list; + } else { + /* Move to the next statement */ + ut_ad(que_node_get_next(thr->prev_node) == NULL); + + thr->run_node = NULL; + } + + if (thr->run_node == NULL) { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Performs an execution step of a procedure call node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + func_node_t* node; + + ut_ad(thr); + + node = static_cast<func_node_t*>(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_FUNC); + + /* Evaluate the procedure */ + + eval_exp(node); + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h new file mode 100644 index 00000000000..168f2f5b594 --- /dev/null +++ b/storage/innobase/include/fil0fil.h @@ -0,0 +1,1019 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fil0fil.h +The low-level file system + +Created 10/25/1995 Heikki Tuuri +*******************************************************/ + +#ifndef fil0fil_h +#define fil0fil_h + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +#include "dict0types.h" +#include "ut0byte.h" +#include "os0file.h" +#ifndef UNIV_HOTBACKUP +#include "sync0rw.h" +#include "ibuf0types.h" +#include "log0log.h" +#endif /* !UNIV_HOTBACKUP */ + +#include <list> + +extern my_bool lower_case_file_system; +// Forward declaration +struct trx_t; +struct fil_space_t; + +typedef std::list<const char*> space_name_list_t; + +/** When mysqld is run, the default directory "." is the mysqld datadir, +but in the MySQL Embedded Server Library and mysqlbackup it is not the default +directory, and we must set the base file path explicitly */ +extern const char* fil_path_to_mysql_datadir; + +/** Initial size of a single-table tablespace in pages */ +#define FIL_IBD_FILE_INITIAL_SIZE 4 + +/** 'null' (undefined) page offset in the context of file spaces */ +#define FIL_NULL ULINT32_UNDEFINED + +/* Space address data type; this is intended to be used when +addresses accurate to a byte are stored in file pages. If the page part +of the address is FIL_NULL, the address is considered undefined. */ + +typedef byte fil_faddr_t; /*!< 'type' definition in C: an address + stored in a file page is a string of bytes */ +#define FIL_ADDR_PAGE 0 /* first in address is the page offset */ +#define FIL_ADDR_BYTE 4 /* then comes 2-byte byte offset within page*/ + +#define FIL_ADDR_SIZE 6 /* address size is 6 bytes */ + +/** File space address */ +struct fil_addr_t{ + ulint page; /*!< page number within a space */ + ulint boffset; /*!< byte offset within the page */ +}; + +/** The null file address */ +extern fil_addr_t fil_addr_null; + +#endif /* !UNIV_INNOCHECKSUM */ + +/** The byte offsets on a file page for various variables @{ */ +#define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the + page belongs to (== 0) but in later + versions the 'new' checksum of the + page */ +#define FIL_PAGE_OFFSET 4 /*!< page offset inside space */ +#define FIL_PAGE_PREV 8 /*!< if there is a 'natural' + predecessor of the page, its + offset. Otherwise FIL_NULL. + This field is not set on BLOB + pages, which are stored as a + singly-linked list. See also + FIL_PAGE_NEXT. */ +#define FIL_PAGE_NEXT 12 /*!< if there is a 'natural' successor + of the page, its offset. + Otherwise FIL_NULL. + B-tree index pages + (FIL_PAGE_TYPE contains FIL_PAGE_INDEX) + on the same PAGE_LEVEL are maintained + as a doubly linked list via + FIL_PAGE_PREV and FIL_PAGE_NEXT + in the collation order of the + smallest user record on each page. */ +#define FIL_PAGE_LSN 16 /*!< lsn of the end of the newest + modification log record to the page */ +#define FIL_PAGE_TYPE 24 /*!< file page type: FIL_PAGE_INDEX,..., + 2 bytes. + + The contents of this field can only + be trusted in the following case: + if the page is an uncompressed + B-tree index page, then it is + guaranteed that the value is + FIL_PAGE_INDEX. + The opposite does not hold. + + In tablespaces created by + MySQL/InnoDB 5.1.7 or later, the + contents of this field is valid + for all uncompressed pages. */ +#define FIL_PAGE_FILE_FLUSH_LSN 26 /*!< this is only defined for the + first page in a system tablespace + data file (ibdata*, not *.ibd): + the file has been flushed to disk + at least up to this lsn */ +#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this + contains the space id of the page */ +#define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID + +#define FIL_PAGE_DATA 38 /*!< start of the data on the page */ +/* @} */ +/** File page trailer @{ */ +#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used + to store the page checksum, the + last 4 bytes should be identical + to the last 4 bytes of FIL_PAGE_LSN */ +#define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */ +/* @} */ + +#ifndef UNIV_INNOCHECKSUM + +/** File page types (values of FIL_PAGE_TYPE) @{ */ +#define FIL_PAGE_INDEX 17855 /*!< B-tree node */ +#define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ +#define FIL_PAGE_INODE 3 /*!< Index node */ +#define FIL_PAGE_IBUF_FREE_LIST 4 /*!< Insert buffer free list */ +/* File page types introduced in MySQL/InnoDB 5.1.7 */ +#define FIL_PAGE_TYPE_ALLOCATED 0 /*!< Freshly allocated page */ +#define FIL_PAGE_IBUF_BITMAP 5 /*!< Insert buffer bitmap */ +#define FIL_PAGE_TYPE_SYS 6 /*!< System page */ +#define FIL_PAGE_TYPE_TRX_SYS 7 /*!< Transaction system data */ +#define FIL_PAGE_TYPE_FSP_HDR 8 /*!< File space header */ +#define FIL_PAGE_TYPE_XDES 9 /*!< Extent descriptor page */ +#define FIL_PAGE_TYPE_BLOB 10 /*!< Uncompressed BLOB page */ +#define FIL_PAGE_TYPE_ZBLOB 11 /*!< First compressed BLOB page */ +#define FIL_PAGE_TYPE_ZBLOB2 12 /*!< Subsequent compressed BLOB page */ +#define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_ZBLOB2 + /*!< Last page type */ +/* @} */ + +/** Space types @{ */ +#define FIL_TABLESPACE 501 /*!< tablespace */ +#define FIL_LOG 502 /*!< redo log */ +/* @} */ + +/** The number of fsyncs done to the log */ +extern ulint fil_n_log_flushes; + +/** Number of pending redo log flushes */ +extern ulint fil_n_pending_log_flushes; +/** Number of pending tablespace flushes */ +extern ulint fil_n_pending_tablespace_flushes; + +/** Number of files currently open */ +extern ulint fil_n_file_opened; + +struct fsp_open_info { + ibool success; /*!< Has the tablespace been opened? */ + const char* check_msg; /*!< fil_check_first_page() message */ + ibool valid; /*!< Is the tablespace valid? */ + os_file_t file; /*!< File handle */ + char* filepath; /*!< File path to open */ + lsn_t lsn; /*!< Flushed LSN from header page */ + ulint id; /*!< Space ID */ + ulint flags; /*!< Tablespace flags */ +#ifdef UNIV_LOG_ARCHIVE + ulint arch_log_no; /*!< latest archived log file number */ +#endif /* UNIV_LOG_ARCHIVE */ +}; + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Returns the version number of a tablespace, -1 if not found. +@return version number, -1 if the tablespace does not exist in the +memory cache */ +UNIV_INTERN +ib_int64_t +fil_space_get_version( +/*==================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the latch of a file space. +@return latch protecting storage allocation */ +UNIV_INTERN +rw_lock_t* +fil_space_get_latch( +/*================*/ + ulint id, /*!< in: space id */ + ulint* zip_size);/*!< out: compressed page size, or + 0 for uncompressed tablespaces */ +/*******************************************************************//** +Returns the type of a file space. +@return FIL_TABLESPACE or FIL_LOG */ +UNIV_INTERN +ulint +fil_space_get_type( +/*===============*/ + ulint id); /*!< in: space id */ +#endif /* !UNIV_HOTBACKUP */ +/*******************************************************************//** +Appends a new file to the chain of files of a space. File must be closed. +@return pointer to the file name, or NULL on error */ +UNIV_INTERN +char* +fil_node_create( +/*============*/ + const char* name, /*!< in: file name (file must be closed) */ + ulint size, /*!< in: file size in database blocks, rounded + downwards to an integer */ + ulint id, /*!< in: space id where to append */ + ibool is_raw) /*!< in: TRUE if a raw device or + a raw disk partition */ + __attribute__((nonnull, warn_unused_result)); +#ifdef UNIV_LOG_ARCHIVE +/****************************************************************//** +Drops files from the start of a file space, so that its size is cut by +the amount given. */ +UNIV_INTERN +void +fil_space_truncate_start( +/*=====================*/ + ulint id, /*!< in: space id */ + ulint trunc_len); /*!< in: truncate by this much; it is an error + if this does not equal to the combined size of + some initial files in the space */ +#endif /* UNIV_LOG_ARCHIVE */ +/*******************************************************************//** +Creates a space memory object and puts it to the 'fil system' hash table. +If there is an error, prints an error message to the .err log. +@return TRUE if success */ +UNIV_INTERN +ibool +fil_space_create( +/*=============*/ + const char* name, /*!< in: space name */ + ulint id, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size, or + 0 for uncompressed tablespaces */ + ulint purpose);/*!< in: FIL_TABLESPACE, or FIL_LOG if log */ +/*******************************************************************//** +Assigns a new space id for a new single-table tablespace. This works simply by +incrementing the global counter. If 4 billion id's is not enough, we may need +to recycle id's. +@return TRUE if assigned, FALSE if not */ +UNIV_INTERN +ibool +fil_assign_new_space_id( +/*====================*/ + ulint* space_id); /*!< in/out: space id */ +/*******************************************************************//** +Returns the path from the first fil_node_t found for the space ID sent. +The caller is responsible for freeing the memory allocated here for the +value returned. +@return a copy of fil_node_t::path, NULL if space is zero or not found. */ +UNIV_INTERN +char* +fil_space_get_first_path( +/*=====================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the size of the space in pages. The tablespace must be cached in the +memory cache. +@return space size, 0 if space not found */ +UNIV_INTERN +ulint +fil_space_get_size( +/*===============*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the flags of the space. The tablespace must be cached +in the memory cache. +@return flags, ULINT_UNDEFINED if space not found */ +UNIV_INTERN +ulint +fil_space_get_flags( +/*================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the compressed page size of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return compressed page size, ULINT_UNDEFINED if space not found */ +UNIV_INTERN +ulint +fil_space_get_zip_size( +/*===================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Checks if the pair space, page_no refers to an existing page in a tablespace +file space. The tablespace must be cached in the memory cache. +@return TRUE if the address is meaningful */ +UNIV_INTERN +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint page_no);/*!< in: page number */ +/****************************************************************//** +Initializes the tablespace memory cache. */ +UNIV_INTERN +void +fil_init( +/*=====*/ + ulint hash_size, /*!< in: hash table size */ + ulint max_n_open); /*!< in: max number of open files */ +/*******************************************************************//** +Initializes the tablespace memory cache. */ +UNIV_INTERN +void +fil_close(void); +/*===========*/ +/*******************************************************************//** +Opens all log files and system tablespace data files. They stay open until the +database server shutdown. This should be called at a server startup after the +space objects for the log and the system tablespace have been created. The +purpose of this operation is to make sure we never run out of file descriptors +if we need to read from the insert buffer or to write to the log. */ +UNIV_INTERN +void +fil_open_log_and_system_tablespace_files(void); +/*==========================================*/ +/*******************************************************************//** +Closes all open files. There must not be any pending i/o's or not flushed +modifications in the files. */ +UNIV_INTERN +void +fil_close_all_files(void); +/*=====================*/ +/*******************************************************************//** +Closes the redo log files. There must not be any pending i/o's or not +flushed modifications in the files. */ +UNIV_INTERN +void +fil_close_log_files( +/*================*/ + bool free); /*!< in: whether to free the memory object */ +/*******************************************************************//** +Sets the max tablespace id counter if the given number is bigger than the +previous value. */ +UNIV_INTERN +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id);/*!< in: maximum known id */ +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Writes the flushed lsn and the latest archived log number to the page +header of the first page of each data file in the system tablespace. +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +fil_write_flushed_lsn_to_data_files( +/*================================*/ + lsn_t lsn, /*!< in: lsn to write */ + ulint arch_log_no); /*!< in: latest archived log file number */ +/*******************************************************************//** +Reads the flushed lsn, arch no, and tablespace flag fields from a data +file at database startup. +@retval NULL on success, or if innodb_force_recovery is set +@return pointer to an error message string */ +UNIV_INTERN +const char* +fil_read_first_page( +/*================*/ + os_file_t data_file, /*!< in: open data file */ + ibool one_read_already, /*!< in: TRUE if min and max + parameters below already + contain sensible data */ + ulint* flags, /*!< out: tablespace flags */ + ulint* space_id, /*!< out: tablespace ID */ +#ifdef UNIV_LOG_ARCHIVE + ulint* min_arch_log_no, /*!< out: min of archived + log numbers in data files */ + ulint* max_arch_log_no, /*!< out: max of archived + log numbers in data files */ +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t* min_flushed_lsn, /*!< out: min of flushed + lsn values in data files */ + lsn_t* max_flushed_lsn) /*!< out: max of flushed + lsn values in data files */ + __attribute__((warn_unused_result)); +/*******************************************************************//** +Increments the count of pending operation, if space is not being deleted. +@return TRUE if being deleted, and operation should be skipped */ +UNIV_INTERN +ibool +fil_inc_pending_ops( +/*================*/ + ulint id, /*!< in: space id */ + ibool print_err); /*!< in: need to print error or not */ +/*******************************************************************//** +Decrements the count of pending operations. */ +UNIV_INTERN +void +fil_decr_pending_ops( +/*=================*/ + ulint id); /*!< in: space id */ +#endif /* !UNIV_HOTBACKUP */ +/*******************************************************************//** +Parses the body of a log record written about an .ibd file operation. That is, +the log record part after the standard (type, space id, page no) header of the +log record. + +If desired, also replays the delete or rename operation if the .ibd file +exists and the space id in it matches. Replays the create operation if a file +at that path does not exist yet. If the database directory for the file to be +created does not exist, then we create the directory, too. + +Note that mysqlbackup --apply-log sets fil_path_to_mysql_datadir to point to +the datadir that we should use in replaying the file operations. +@return end of log record, or NULL if the record was not completely +contained between ptr and end_ptr */ +UNIV_INTERN +byte* +fil_op_log_parse_or_replay( +/*=======================*/ + byte* ptr, /*!< in: buffer containing the log record body, + or an initial segment of it, if the record does + not fir completely between ptr and end_ptr */ + byte* end_ptr, /*!< in: buffer end */ + ulint type, /*!< in: the type of this log record */ + ulint space_id, /*!< in: the space id of the tablespace in + question, or 0 if the log record should + only be parsed but not replayed */ + ulint log_flags); /*!< in: redo log flags + (stored in the page number parameter) */ +/*******************************************************************//** +Deletes a single-table tablespace. The tablespace must be cached in the +memory cache. +@return TRUE if success */ +UNIV_INTERN +dberr_t +fil_delete_tablespace( +/*==================*/ + ulint id, /*!< in: space id */ + buf_remove_t buf_remove); /*!< in: specify the action to take + on the tables pages in the buffer + pool */ +/*******************************************************************//** +Closes a single-table tablespace. The tablespace must be cached in the +memory cache. Free all pages used by the tablespace. +@return DB_SUCCESS or error */ +UNIV_INTERN +dberr_t +fil_close_tablespace( +/*=================*/ + trx_t* trx, /*!< in/out: Transaction covering the close */ + ulint id); /*!< in: space id */ +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Discards a single-table tablespace. The tablespace must be cached in the +memory cache. Discarding is like deleting a tablespace, but + + 1. We do not drop the table from the data dictionary; + + 2. We remove all insert buffer entries for the tablespace immediately; + in DROP TABLE they are only removed gradually in the background; + + 3. When the user does IMPORT TABLESPACE, the tablespace will have the + same id as it originally had. + + 4. Free all the pages in use by the tablespace if rename=TRUE. +@return DB_SUCCESS or error */ +UNIV_INTERN +dberr_t +fil_discard_tablespace( +/*===================*/ + ulint id) /*!< in: space id */ + __attribute__((warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/*******************************************************************//** +Renames a single-table tablespace. The tablespace must be cached in the +tablespace memory cache. +@return TRUE if success */ +UNIV_INTERN +ibool +fil_rename_tablespace( +/*==================*/ + const char* old_name_in, /*!< in: old table name in the + standard databasename/tablename + format of InnoDB, or NULL if we + do the rename based on the space + id only */ + ulint id, /*!< in: space id */ + const char* new_name, /*!< in: new table name in the + standard databasename/tablename + format of InnoDB */ + const char* new_path); /*!< in: new full datafile path + if the tablespace is remotely + located, or NULL if it is located + in the normal data directory. */ + +/*******************************************************************//** +Allocates a file name for a single-table tablespace. The string must be freed +by caller with mem_free(). +@return own: file name */ +UNIV_INTERN +char* +fil_make_ibd_name( +/*==============*/ + const char* name, /*!< in: table name or a dir path */ + bool is_full_path); /*!< in: TRUE if it is a dir path */ +/*******************************************************************//** +Allocates a file name for a tablespace ISL file (InnoDB Symbolic Link). +The string must be freed by caller with mem_free(). +@return own: file name */ +UNIV_INTERN +char* +fil_make_isl_name( +/*==============*/ + const char* name); /*!< in: table name */ +/*******************************************************************//** +Creates a new InnoDB Symbolic Link (ISL) file. It is always created +under the 'datadir' of MySQL. The datadir is the directory of a +running mysqld program. We can refer to it by simply using the path '.'. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_create_link_file( +/*=================*/ + const char* tablename, /*!< in: tablename */ + const char* filepath); /*!< in: pathname of tablespace */ +/*******************************************************************//** +Deletes an InnoDB Symbolic Link (ISL) file. */ +UNIV_INTERN +void +fil_delete_link_file( +/*==================*/ + const char* tablename); /*!< in: name of table */ +/*******************************************************************//** +Reads an InnoDB Symbolic Link (ISL) file. +It is always created under the 'datadir' of MySQL. The name is of the +form {databasename}/{tablename}. and the isl file is expected to be in a +'{databasename}' directory called '{tablename}.isl'. The caller must free +the memory of the null-terminated path returned if it is not null. +@return own: filepath found in link file, NULL if not found. */ +UNIV_INTERN +char* +fil_read_link_file( +/*===============*/ + const char* name); /*!< in: tablespace name */ +/*******************************************************************//** +Creates a new single-table tablespace to a database directory of MySQL. +Database directories are under the 'datadir' of MySQL. The datadir is the +directory of a running mysqld program. We can refer to it by simply the +path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp +dir of the mysqld server. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_create_new_single_table_tablespace( +/*===================================*/ + ulint space_id, /*!< in: space id */ + const char* tablename, /*!< in: the table name in the usual + databasename/tablename format + of InnoDB */ + const char* dir_path, /*!< in: NULL or a dir path */ + ulint flags, /*!< in: tablespace flags */ + ulint flags2, /*!< in: table flags2 */ + ulint size) /*!< in: the initial size of the + tablespace file in pages, + must be >= FIL_IBD_FILE_INITIAL_SIZE */ + __attribute__((nonnull, warn_unused_result)); +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Tries to open a single-table tablespace and optionally checks the space id is +right in it. If does not succeed, prints an error message to the .err log. This +function is used to open a tablespace when we start up mysqld, and also in +IMPORT TABLESPACE. +NOTE that we assume this operation is used either at the database startup +or under the protection of the dictionary mutex, so that two users cannot +race here. This operation does not leave the file associated with the +tablespace open, but closes it after we have looked at the space id in it. + +If the validate boolean is set, we read the first page of the file and +check that the space id in the file is what we expect. We assume that +this function runs much faster if no check is made, since accessing the +file inode probably is much faster (the OS caches them) than accessing +the first page of the file. This boolean may be initially FALSE, but if +a remote tablespace is found it will be changed to true. + +If the fix_dict boolean is set, then it is safe to use an internal SQL +statement to update the dictionary tables if they are incorrect. + +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_open_single_table_tablespace( +/*=============================*/ + bool validate, /*!< in: Do we validate tablespace? */ + bool fix_dict, /*!< in: Can we fix the dictionary? */ + ulint id, /*!< in: space id */ + ulint flags, /*!< in: tablespace flags */ + const char* tablename, /*!< in: table name in the + databasename/tablename format */ + const char* filepath) /*!< in: tablespace filepath */ + __attribute__((nonnull(5), warn_unused_result)); + +#endif /* !UNIV_HOTBACKUP */ +/********************************************************************//** +At the server startup, if we need crash recovery, scans the database +directories under the MySQL datadir, looking for .ibd files. Those files are +single-table tablespaces. We need to know the space id in each of them so that +we know into which file we should look to check the contents of a page stored +in the doublewrite buffer, also to know where to apply log records where the +space id is != 0. +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +fil_load_single_table_tablespaces(void); +/*===================================*/ +/*******************************************************************//** +Returns TRUE if a single-table tablespace does not exist in the memory cache, +or is being deleted there. +@return TRUE if does not exist or is being deleted */ +UNIV_INTERN +ibool +fil_tablespace_deleted_or_being_deleted_in_mem( +/*===========================================*/ + ulint id, /*!< in: space id */ + ib_int64_t version);/*!< in: tablespace_version should be this; if + you pass -1 as the value of this, then this + parameter is ignored */ +/*******************************************************************//** +Returns TRUE if a single-table tablespace exists in the memory cache. +@return TRUE if exists */ +UNIV_INTERN +ibool +fil_tablespace_exists_in_mem( +/*=========================*/ + ulint id); /*!< in: space id */ +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory +cache. Note that if we have not done a crash recovery at the database startup, +there may be many tablespaces which are not yet in the memory cache. +@return TRUE if a matching tablespace exists in the memory cache */ +UNIV_INTERN +ibool +fil_space_for_table_exists_in_mem( +/*==============================*/ + ulint id, /*!< in: space id */ + const char* name, /*!< in: table name in the standard + 'databasename/tablename' format */ + ibool mark_space, /*!< in: in crash recovery, at database + startup we mark all spaces which have + an associated table in the InnoDB + data dictionary, so that + we can print a warning about orphaned + tablespaces */ + ibool print_error_if_does_not_exist, + /*!< in: print detailed error + information to the .err log if a + matching tablespace is not found from + memory */ + bool adjust_space, /*!< in: whether to adjust space id + when find table space mismatch */ + mem_heap_t* heap, /*!< in: heap memory */ + table_id_t table_id); /*!< in: table id */ +#else /* !UNIV_HOTBACKUP */ +/********************************************************************//** +Extends all tablespaces to the size stored in the space header. During the +mysqlbackup --apply-log phase we extended the spaces on-demand so that log +records could be appllied, but that may have left spaces still too small +compared to the size stored in the space header. */ +UNIV_INTERN +void +fil_extend_tablespaces_to_stored_len(void); +/*======================================*/ +#endif /* !UNIV_HOTBACKUP */ +/**********************************************************************//** +Tries to extend a data file so that it would accommodate the number of pages +given. The tablespace must be cached in the memory cache. If the space is big +enough already, does nothing. +@return TRUE if success */ +UNIV_INTERN +ibool +fil_extend_space_to_desired_size( +/*=============================*/ + ulint* actual_size, /*!< out: size of the space after extension; + if we ran out of disk space this may be lower + than the desired size */ + ulint space_id, /*!< in: space id */ + ulint size_after_extend);/*!< in: desired size in pages after the + extension; if the current space size is bigger + than this already, the function does nothing */ +/*******************************************************************//** +Tries to reserve free extents in a file space. +@return TRUE if succeed */ +UNIV_INTERN +ibool +fil_space_reserve_free_extents( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint n_free_now, /*!< in: number of free extents now */ + ulint n_to_reserve); /*!< in: how many one wants to reserve */ +/*******************************************************************//** +Releases free extents in a file space. */ +UNIV_INTERN +void +fil_space_release_free_extents( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint n_reserved); /*!< in: how many one reserved */ +/*******************************************************************//** +Gets the number of reserved extents. If the database is silent, this number +should be zero. */ +UNIV_INTERN +ulint +fil_space_get_n_reserved_extents( +/*=============================*/ + ulint id); /*!< in: space id */ +/********************************************************************//** +Reads or writes data. This operation is asynchronous (aio). +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do +i/o on a tablespace which does not exist */ +UNIV_INTERN +dberr_t +fil_io( +/*===*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE, + ORed to OS_FILE_LOG, if a log i/o + and ORed to OS_AIO_SIMULATED_WAKE_LATER + if simulated aio and we want to post a + batch of i/os; NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! */ + bool sync, /*!< in: true if synchronous aio is desired */ + ulint space_id, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /*!< in: offset in number of blocks */ + ulint byte_offset, /*!< in: remainder of offset in bytes; in + aio this must be divisible by the OS block + size */ + ulint len, /*!< in: how many bytes to read or write; this + must not cross a file boundary; in aio this + must be a block size multiple */ + void* buf, /*!< in/out: buffer where to store read data + or from where to write; in aio this must be + appropriately aligned */ + void* message) /*!< in: message for aio handler if non-sync + aio used, else ignored */ + __attribute__((nonnull(8))); +/**********************************************************************//** +Waits for an aio operation to complete. This function is used to write the +handler for completed requests. The aio array of pending requests is divided +into segments (see os0file.cc for more info). The thread specifies which +segment it wants to wait for. */ +UNIV_INTERN +void +fil_aio_wait( +/*=========*/ + ulint segment); /*!< in: the number of the segment in the aio + array to wait for */ +/**********************************************************************//** +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ +UNIV_INTERN +void +fil_flush( +/*======*/ + ulint space_id); /*!< in: file space id (this can be a group of + log files or a tablespace of the database) */ +/**********************************************************************//** +Flushes to disk writes in file spaces of the given type possibly cached by +the OS. */ +UNIV_INTERN +void +fil_flush_file_spaces( +/*==================*/ + ulint purpose); /*!< in: FIL_TABLESPACE, FIL_LOG */ +/******************************************************************//** +Checks the consistency of the tablespace cache. +@return TRUE if ok */ +UNIV_INTERN +ibool +fil_validate(void); +/*==============*/ +/********************************************************************//** +Returns TRUE if file address is undefined. +@return TRUE if undefined */ +UNIV_INTERN +ibool +fil_addr_is_null( +/*=============*/ + fil_addr_t addr); /*!< in: address */ +/********************************************************************//** +Get the predecessor of a file page. +@return FIL_PAGE_PREV */ +UNIV_INTERN +ulint +fil_page_get_prev( +/*==============*/ + const byte* page); /*!< in: file page */ +/********************************************************************//** +Get the successor of a file page. +@return FIL_PAGE_NEXT */ +UNIV_INTERN +ulint +fil_page_get_next( +/*==============*/ + const byte* page); /*!< in: file page */ +/*********************************************************************//** +Sets the file page type. */ +UNIV_INTERN +void +fil_page_set_type( +/*==============*/ + byte* page, /*!< in/out: file page */ + ulint type); /*!< in: type */ +/*********************************************************************//** +Gets the file page type. +@return type; NOTE that if the type has not been written to page, the +return value not defined */ +UNIV_INTERN +ulint +fil_page_get_type( +/*==============*/ + const byte* page); /*!< in: file page */ + +/*******************************************************************//** +Returns TRUE if a single-table tablespace is being deleted. +@return TRUE if being deleted */ +UNIV_INTERN +ibool +fil_tablespace_is_being_deleted( +/*============================*/ + ulint id); /*!< in: space id */ + +/********************************************************************//** +Delete the tablespace file and any related files like .cfg. +This should not be called for temporary tables. */ +UNIV_INTERN +void +fil_delete_file( +/*============*/ + const char* path); /*!< in: filepath of the ibd tablespace */ + +/** Callback functor. */ +struct PageCallback { + + /** + Default constructor */ + PageCallback() + : + m_zip_size(), + m_page_size(), + m_filepath() UNIV_NOTHROW {} + + virtual ~PageCallback() UNIV_NOTHROW {} + + /** + Called for page 0 in the tablespace file at the start. + @param file_size - size of the file in bytes + @param block - contents of the first page in the tablespace file + @retval DB_SUCCESS or error code.*/ + virtual dberr_t init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW = 0; + + /** + Called for every page in the tablespace. If the page was not + updated then its state must be set to BUF_PAGE_NOT_USED. For + compressed tables the page descriptor memory will be at offset: + block->frame + UNIV_PAGE_SIZE; + @param offset - physical offset within the file + @param block - block read from file, note it is not from the buffer pool + @retval DB_SUCCESS or error code. */ + virtual dberr_t operator()( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW = 0; + + /** + Set the name of the physical file and the file handle that is used + to open it for the file that is being iterated over. + @param filename - then physical name of the tablespace file. + @param file - OS file handle */ + void set_file(const char* filename, os_file_t file) UNIV_NOTHROW + { + m_file = file; + m_filepath = filename; + } + + /** + @return the space id of the tablespace */ + virtual ulint get_space_id() const UNIV_NOTHROW = 0; + + /** The compressed page size + @return the compressed page size */ + ulint get_zip_size() const + { + return(m_zip_size); + } + + /** + Set the tablespace compressed table size. + @return DB_SUCCESS if it is valie or DB_CORRUPTION if not */ + dberr_t set_zip_size(const buf_frame_t* page) UNIV_NOTHROW; + + /** The compressed page size + @return the compressed page size */ + ulint get_page_size() const + { + return(m_page_size); + } + + /** Compressed table page size */ + ulint m_zip_size; + + /** The tablespace page size. */ + ulint m_page_size; + + /** File handle to the tablespace */ + os_file_t m_file; + + /** Physical file path. */ + const char* m_filepath; + +protected: + // Disable copying + PageCallback(const PageCallback&); + PageCallback& operator=(const PageCallback&); +}; + +/********************************************************************//** +Iterate over all the pages in the tablespace. +@param table - the table definiton in the server +@param n_io_buffers - number of blocks to read and write together +@param callback - functor that will do the page updates +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_tablespace_iterate( +/*===================*/ + dict_table_t* table, + ulint n_io_buffers, + PageCallback& callback) + __attribute__((nonnull, warn_unused_result)); + +/*******************************************************************//** +Checks if a single-table tablespace for a given table name exists in the +tablespace memory cache. +@return space id, ULINT_UNDEFINED if not found */ +UNIV_INTERN +ulint +fil_get_space_id_for_table( +/*=======================*/ + const char* name); /*!< in: table name in the standard + 'databasename/tablename' format */ + +/** +Iterate over all the spaces in the space list and fetch the +tablespace names. It will return a copy of the name that must be +freed by the caller using: delete[]. +@return DB_SUCCESS if all OK. */ +UNIV_INTERN +dberr_t +fil_get_space_names( +/*================*/ + space_name_list_t& space_name_list) + /*!< in/out: Vector for collecting the names. */ + __attribute__((warn_unused_result)); + +/****************************************************************//** +Generate redo logs for swapping two .ibd files */ +UNIV_INTERN +void +fil_mtr_rename_log( +/*===============*/ + ulint old_space_id, /*!< in: tablespace id of the old + table. */ + const char* old_name, /*!< in: old table name */ + ulint new_space_id, /*!< in: tablespace id of the new + table */ + const char* new_name, /*!< in: new table name */ + const char* tmp_name, /*!< in: temp table name used while + swapping */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); + +/*******************************************************************//** +Finds the given page_no of the given space id from the double write buffer, +and copies it to the corresponding .ibd file. +@return true if copy was successful, or false. */ +bool +fil_user_tablespace_restore_page( +/*==============================*/ + fsp_open_info* fsp, /* in: contains space id and .ibd + file information */ + ulint page_no); /* in: page_no to obtain from double + write buffer */ + +#endif /* !UNIV_INNOCHECKSUM */ +#endif /* fil0fil_h */ diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h new file mode 100644 index 00000000000..a587ccc9f20 --- /dev/null +++ b/storage/innobase/include/fsp0fsp.h @@ -0,0 +1,747 @@ +/***************************************************************************** + +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0fsp.h +File space management + +Created 12/18/1995 Heikki Tuuri +*******************************************************/ + +#ifndef fsp0fsp_h +#define fsp0fsp_h + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +#include "mtr0mtr.h" +#include "fut0lst.h" +#include "ut0byte.h" +#include "page0types.h" +#include "fsp0types.h" + +#endif /* !UNIV_INNOCHECKSUM */ + +/* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */ + +/** Width of the POST_ANTELOPE flag */ +#define FSP_FLAGS_WIDTH_POST_ANTELOPE 1 +/** Number of flag bits used to indicate the tablespace zip page size */ +#define FSP_FLAGS_WIDTH_ZIP_SSIZE 4 +/** Width of the ATOMIC_BLOBS flag. The ability to break up a long +column into an in-record prefix and an externally stored part is available +to the two Barracuda row formats COMPRESSED and DYNAMIC. */ +#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS 1 +/** Number of flag bits used to indicate the tablespace page size */ +#define FSP_FLAGS_WIDTH_PAGE_SSIZE 4 +/** Width of the DATA_DIR flag. This flag indicates that the tablespace +is found in a remote location, not the default data directory. */ +#define FSP_FLAGS_WIDTH_DATA_DIR 1 +/** Width of all the currently known tablespace flags */ +#define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \ + + FSP_FLAGS_WIDTH_ZIP_SSIZE \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_PAGE_SSIZE \ + + FSP_FLAGS_WIDTH_DATA_DIR) + +/** A mask of all the known/used bits in tablespace flags */ +#define FSP_FLAGS_MASK (~(~0 << FSP_FLAGS_WIDTH)) + +/** Zero relative shift position of the POST_ANTELOPE field */ +#define FSP_FLAGS_POS_POST_ANTELOPE 0 +/** Zero relative shift position of the ZIP_SSIZE field */ +#define FSP_FLAGS_POS_ZIP_SSIZE (FSP_FLAGS_POS_POST_ANTELOPE \ + + FSP_FLAGS_WIDTH_POST_ANTELOPE) +/** Zero relative shift position of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \ + + FSP_FLAGS_WIDTH_ZIP_SSIZE) +/** Zero relative shift position of the PAGE_SSIZE field */ +#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the start of the UNUSED bits */ +#define FSP_FLAGS_POS_DATA_DIR (FSP_FLAGS_POS_PAGE_SSIZE \ + + FSP_FLAGS_WIDTH_PAGE_SSIZE) +/** Zero relative shift position of the start of the UNUSED bits */ +#define FSP_FLAGS_POS_UNUSED (FSP_FLAGS_POS_DATA_DIR \ + + FSP_FLAGS_WIDTH_DATA_DIR) + +/** Bit mask of the POST_ANTELOPE field */ +#define FSP_FLAGS_MASK_POST_ANTELOPE \ + ((~(~0 << FSP_FLAGS_WIDTH_POST_ANTELOPE)) \ + << FSP_FLAGS_POS_POST_ANTELOPE) +/** Bit mask of the ZIP_SSIZE field */ +#define FSP_FLAGS_MASK_ZIP_SSIZE \ + ((~(~0 << FSP_FLAGS_WIDTH_ZIP_SSIZE)) \ + << FSP_FLAGS_POS_ZIP_SSIZE) +/** Bit mask of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_MASK_ATOMIC_BLOBS \ + ((~(~0 << FSP_FLAGS_WIDTH_ATOMIC_BLOBS)) \ + << FSP_FLAGS_POS_ATOMIC_BLOBS) +/** Bit mask of the PAGE_SSIZE field */ +#define FSP_FLAGS_MASK_PAGE_SSIZE \ + ((~(~0 << FSP_FLAGS_WIDTH_PAGE_SSIZE)) \ + << FSP_FLAGS_POS_PAGE_SSIZE) +/** Bit mask of the DATA_DIR field */ +#define FSP_FLAGS_MASK_DATA_DIR \ + ((~(~0 << FSP_FLAGS_WIDTH_DATA_DIR)) \ + << FSP_FLAGS_POS_DATA_DIR) + +/** Return the value of the POST_ANTELOPE field */ +#define FSP_FLAGS_GET_POST_ANTELOPE(flags) \ + ((flags & FSP_FLAGS_MASK_POST_ANTELOPE) \ + >> FSP_FLAGS_POS_POST_ANTELOPE) +/** Return the value of the ZIP_SSIZE field */ +#define FSP_FLAGS_GET_ZIP_SSIZE(flags) \ + ((flags & FSP_FLAGS_MASK_ZIP_SSIZE) \ + >> FSP_FLAGS_POS_ZIP_SSIZE) +/** Return the value of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_BLOBS) \ + >> FSP_FLAGS_POS_ATOMIC_BLOBS) +/** Return the value of the PAGE_SSIZE field */ +#define FSP_FLAGS_GET_PAGE_SSIZE(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_SSIZE) \ + >> FSP_FLAGS_POS_PAGE_SSIZE) +/** Return the value of the DATA_DIR field */ +#define FSP_FLAGS_HAS_DATA_DIR(flags) \ + ((flags & FSP_FLAGS_MASK_DATA_DIR) \ + >> FSP_FLAGS_POS_DATA_DIR) +/** Return the contents of the UNUSED bits */ +#define FSP_FLAGS_GET_UNUSED(flags) \ + (flags >> FSP_FLAGS_POS_UNUSED) + +/** Set a PAGE_SSIZE into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \ + (flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE)) + +/* @} */ + +/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ + +/** Offset of the space header within a file page */ +#define FSP_HEADER_OFFSET FIL_PAGE_DATA + +/* The data structures in files are defined just as byte strings in C */ +typedef byte fsp_header_t; +typedef byte xdes_t; + +/* SPACE HEADER + ============ + +File space header data structure: this data structure is contained in the +first page of a space. The space for this header is reserved in every extent +descriptor page, but used only in the first. */ + +/*-------------------------------------*/ +#define FSP_SPACE_ID 0 /* space id */ +#define FSP_NOT_USED 4 /* this field contained a value up to + which we know that the modifications + in the database have been flushed to + the file space; not used now */ +#define FSP_SIZE 8 /* Current size of the space in + pages */ +#define FSP_FREE_LIMIT 12 /* Minimum page number for which the + free list has not been initialized: + the pages >= this limit are, by + definition, free; note that in a + single-table tablespace where size + < 64 pages, this number is 64, i.e., + we have initialized the space + about the first extent, but have not + physically allocted those pages to the + file */ +#define FSP_SPACE_FLAGS 16 /* fsp_space_t.flags, similar to + dict_table_t::flags */ +#define FSP_FRAG_N_USED 20 /* number of used pages in the + FSP_FREE_FRAG list */ +#define FSP_FREE 24 /* list of free extents */ +#define FSP_FREE_FRAG (24 + FLST_BASE_NODE_SIZE) + /* list of partially free extents not + belonging to any segment */ +#define FSP_FULL_FRAG (24 + 2 * FLST_BASE_NODE_SIZE) + /* list of full extents not belonging + to any segment */ +#define FSP_SEG_ID (24 + 3 * FLST_BASE_NODE_SIZE) + /* 8 bytes which give the first unused + segment id */ +#define FSP_SEG_INODES_FULL (32 + 3 * FLST_BASE_NODE_SIZE) + /* list of pages containing segment + headers, where all the segment inode + slots are reserved */ +#define FSP_SEG_INODES_FREE (32 + 4 * FLST_BASE_NODE_SIZE) + /* list of pages containing segment + headers, where not all the segment + header slots are reserved */ +/*-------------------------------------*/ +/* File space header size */ +#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE) + +#define FSP_FREE_ADD 4 /* this many free extents are added + to the free list from above + FSP_FREE_LIMIT at a time */ +/* @} */ + +#ifndef UNIV_INNOCHECKSUM + +/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */ + +/* FILE SEGMENT INODE + ================== + +Segment inode which is created for each segment in a tablespace. NOTE: in +purge we assume that a segment having only one currently used page can be +freed in a few steps, so that the freeing cannot fill the file buffer with +bufferfixed file pages. */ + +typedef byte fseg_inode_t; + +#define FSEG_INODE_PAGE_NODE FSEG_PAGE_DATA + /* the list node for linking + segment inode pages */ + +#define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE) +/*-------------------------------------*/ +#define FSEG_ID 0 /* 8 bytes of segment id: if this is 0, + it means that the header is unused */ +#define FSEG_NOT_FULL_N_USED 8 + /* number of used segment pages in + the FSEG_NOT_FULL list */ +#define FSEG_FREE 12 + /* list of free extents of this + segment */ +#define FSEG_NOT_FULL (12 + FLST_BASE_NODE_SIZE) + /* list of partially free extents */ +#define FSEG_FULL (12 + 2 * FLST_BASE_NODE_SIZE) + /* list of full extents */ +#define FSEG_MAGIC_N (12 + 3 * FLST_BASE_NODE_SIZE) + /* magic number used in debugging */ +#define FSEG_FRAG_ARR (16 + 3 * FLST_BASE_NODE_SIZE) + /* array of individual pages + belonging to this segment in fsp + fragment extent lists */ +#define FSEG_FRAG_ARR_N_SLOTS (FSP_EXTENT_SIZE / 2) + /* number of slots in the array for + the fragment pages */ +#define FSEG_FRAG_SLOT_SIZE 4 /* a fragment page slot contains its + page number within space, FIL_NULL + means that the slot is not in use */ +/*-------------------------------------*/ +#define FSEG_INODE_SIZE \ + (16 + 3 * FLST_BASE_NODE_SIZE \ + + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE) + +#define FSP_SEG_INODES_PER_PAGE(zip_size) \ + (((zip_size ? zip_size : UNIV_PAGE_SIZE) \ + - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE) + /* Number of segment inodes which fit on a + single page */ + +#define FSEG_MAGIC_N_VALUE 97937874 + +#define FSEG_FILLFACTOR 8 /* If this value is x, then if + the number of unused but reserved + pages in a segment is less than + reserved pages * 1/x, and there are + at least FSEG_FRAG_LIMIT used pages, + then we allow a new empty extent to + be added to the segment in + fseg_alloc_free_page. Otherwise, we + use unused pages of the segment. */ + +#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS + /* If the segment has >= this many + used pages, it may be expanded by + allocating extents to the segment; + until that only individual fragment + pages are allocated from the space */ + +#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment + is at least this many extents, we + allow extents to be put to the free + list of the extent: at most + FSEG_FREE_LIST_MAX_LEN many */ +#define FSEG_FREE_LIST_MAX_LEN 4 +/* @} */ + +/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */ + +/* EXTENT DESCRIPTOR + ================= + +File extent descriptor data structure: contains bits to tell which pages in +the extent are free and which contain old tuple version to clean. */ + +/*-------------------------------------*/ +#define XDES_ID 0 /* The identifier of the segment + to which this extent belongs */ +#define XDES_FLST_NODE 8 /* The list node data structure + for the descriptors */ +#define XDES_STATE (FLST_NODE_SIZE + 8) + /* contains state information + of the extent */ +#define XDES_BITMAP (FLST_NODE_SIZE + 12) + /* Descriptor bitmap of the pages + in the extent */ +/*-------------------------------------*/ + +#define XDES_BITS_PER_PAGE 2 /* How many bits are there per page */ +#define XDES_FREE_BIT 0 /* Index of the bit which tells if + the page is free */ +#define XDES_CLEAN_BIT 1 /* NOTE: currently not used! + Index of the bit which tells if + there are old versions of tuples + on the page */ +/* States of a descriptor */ +#define XDES_FREE 1 /* extent is in free list of space */ +#define XDES_FREE_FRAG 2 /* extent is in free fragment list of + space */ +#define XDES_FULL_FRAG 3 /* extent is in full fragment list of + space */ +#define XDES_FSEG 4 /* extent belongs to a segment */ + +/** File extent data structure size in bytes. */ +#define XDES_SIZE \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE)) + +/** File extent data structure size in bytes for MAX page size. */ +#define XDES_SIZE_MAX \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE)) + +/** File extent data structure size in bytes for MIN page size. */ +#define XDES_SIZE_MIN \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE)) + +/** Offset of the descriptor array on a descriptor page */ +#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE) + +/* @} */ + +/**********************************************************************//** +Initializes the file space system. */ +UNIV_INTERN +void +fsp_init(void); +/*==========*/ +/**********************************************************************//** +Gets the size of the system tablespace from the tablespace header. If +we do not have an auto-extending data file, this should be equal to +the size of the data files. If there is an auto-extending data file, +this can be smaller. +@return size in pages */ +UNIV_INTERN +ulint +fsp_header_get_tablespace_size(void); +/*================================*/ +/**********************************************************************//** +Reads the file space size stored in the header page. +@return tablespace size stored in the space header */ +UNIV_INTERN +ulint +fsp_get_size_low( +/*=============*/ + page_t* page); /*!< in: header page (page 0 in the tablespace) */ +/**********************************************************************//** +Reads the space id from the first page of a tablespace. +@return space id, ULINT UNDEFINED if error */ +UNIV_INTERN +ulint +fsp_header_get_space_id( +/*====================*/ + const page_t* page); /*!< in: first page of a tablespace */ +/**********************************************************************//** +Reads the space flags from the first page of a tablespace. +@return flags */ +UNIV_INTERN +ulint +fsp_header_get_flags( +/*=================*/ + const page_t* page); /*!< in: first page of a tablespace */ +/**********************************************************************//** +Reads the compressed page size from the first page of a tablespace. +@return compressed page size in bytes, or 0 if uncompressed */ +UNIV_INTERN +ulint +fsp_header_get_zip_size( +/*====================*/ + const page_t* page); /*!< in: first page of a tablespace */ +/**********************************************************************//** +Writes the space id and flags to a tablespace header. The flags contain +row type, physical/compressed page size, and logical/uncompressed page +size of the tablespace. */ +UNIV_INTERN +void +fsp_header_init_fields( +/*===================*/ + page_t* page, /*!< in/out: first page in the space */ + ulint space_id, /*!< in: space id */ + ulint flags); /*!< in: tablespace flags (FSP_SPACE_FLAGS): + 0, or table->flags if newer than COMPACT */ +/**********************************************************************//** +Initializes the space header of a new created space and creates also the +insert buffer tree root if space == 0. */ +UNIV_INTERN +void +fsp_header_init( +/*============*/ + ulint space, /*!< in: space id */ + ulint size, /*!< in: current size in blocks */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Increases the space size field of a space. */ +UNIV_INTERN +void +fsp_header_inc_size( +/*================*/ + ulint space, /*!< in: space id */ + ulint size_inc, /*!< in: size increment in pages */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Creates a new segment. +@return the block where the segment header is placed, x-latched, NULL +if could not create segment because of lack of space */ +UNIV_INTERN +buf_block_t* +fseg_create( +/*========*/ + ulint space, /*!< in: space id */ + ulint page, /*!< in: page where the segment header is placed: if + this is != 0, the page must belong to another segment, + if this is 0, a new page will be allocated and it + will belong to the created segment */ + ulint byte_offset, /*!< in: byte offset of the created segment header + on the page */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Creates a new segment. +@return the block where the segment header is placed, x-latched, NULL +if could not create segment because of lack of space */ +UNIV_INTERN +buf_block_t* +fseg_create_general( +/*================*/ + ulint space, /*!< in: space id */ + ulint page, /*!< in: page where the segment header is placed: if + this is != 0, the page must belong to another segment, + if this is 0, a new page will be allocated and it + will belong to the created segment */ + ulint byte_offset, /*!< in: byte offset of the created segment header + on the page */ + ibool has_done_reservation, /*!< in: TRUE if the caller has already + done the reservation for the pages with + fsp_reserve_free_extents (at least 2 extents: one for + the inode and the other for the segment) then there is + no need to do the check for this individual + operation */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Calculates the number of pages reserved by a segment, and how many pages are +currently used. +@return number of reserved pages */ +UNIV_INTERN +ulint +fseg_n_reserved_pages( +/*==================*/ + fseg_header_t* header, /*!< in: segment header */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize +file space fragmentation. +@param[in/out] seg_header segment header +@param[in] hint hint of which page would be desirable +@param[in] direction if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR +@param[in/out] mtr mini-transaction +@return X-latched block, or NULL if no page could be allocated */ +#define fseg_alloc_free_page(seg_header, hint, direction, mtr) \ + fseg_alloc_free_page_general(seg_header, hint, direction, \ + FALSE, mtr, mtr) +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. +@retval NULL if no page could be allocated +@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded +(init_mtr == mtr, or the page was not previously freed in mtr) +@retval block (not allocated or initialized) otherwise */ +UNIV_INTERN +buf_block_t* +fseg_alloc_free_page_general( +/*=========================*/ + fseg_header_t* seg_header,/*!< in/out: segment header */ + ulint hint, /*!< in: hint of which page would be + desirable */ + byte direction,/*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + ibool has_done_reservation, /*!< in: TRUE if the caller has + already done the reservation for the page + with fsp_reserve_free_extents, then there + is no need to do the check for this individual + page */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. + If init_mtr!=mtr, but the page is already + latched in mtr, do not initialize the page. */ + __attribute__((warn_unused_result, nonnull)); +/**********************************************************************//** +Reserves free pages from a tablespace. All mini-transactions which may +use several pages from the tablespace should call this function beforehand +and reserve enough free extents so that they certainly will be able +to do their operation, like a B-tree page split, fully. Reservations +must be released with function fil_space_release_free_extents! + +The alloc_type below has the following meaning: FSP_NORMAL means an +operation which will probably result in more space usage, like an +insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are +deleting rows, then this allocation will in the long run result in +less space usage (after a purge); FSP_CLEANING means allocation done +in a physical record delete (like in a purge) or other cleaning operation +which will result in less space usage in the long run. We prefer the latter +two types of allocation: when space is scarce, FSP_NORMAL allocations +will not succeed, but the latter two allocations will succeed, if possible. +The purpose is to avoid dead end where the database is full but the +user cannot free any space because these freeing operations temporarily +reserve some space. + +Single-table tablespaces whose size is < 32 pages are a special case. In this +function we would liberally reserve several 64 page extents for every page +split or merge in a B-tree. But we do not want to waste disk space if the table +only occupies < 32 pages. That is why we apply different rules in that special +case, just ensuring that there are 3 free pages available. +@return TRUE if we were able to make the reservation */ +UNIV_INTERN +ibool +fsp_reserve_free_extents( +/*=====================*/ + ulint* n_reserved,/*!< out: number of extents actually reserved; if we + return TRUE and the tablespace size is < 64 pages, + then this can be 0, otherwise it is n_ext */ + ulint space, /*!< in: space id */ + ulint n_ext, /*!< in: number of extents to reserve */ + ulint alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */ + mtr_t* mtr); /*!< in: mini-transaction */ +/**********************************************************************//** +This function should be used to get information on how much we still +will be able to insert new data to the database without running out the +tablespace. Only free extents are taken into account and we also subtract +the safety margin required by the above function fsp_reserve_free_extents. +@return available space in kB */ +UNIV_INTERN +ullint +fsp_get_available_space_in_free_extents( +/*====================================*/ + ulint space); /*!< in: space id */ +/**********************************************************************//** +Frees a single page of a segment. */ +UNIV_INTERN +void +fseg_free_page( +/*===========*/ + fseg_header_t* seg_header, /*!< in: segment header */ + ulint space, /*!< in: space id */ + ulint page, /*!< in: page offset */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Checks if a single page of a segment is free. +@return true if free */ +UNIV_INTERN +bool +fseg_page_is_free( +/*==============*/ + fseg_header_t* seg_header, /*!< in: segment header */ + ulint space, /*!< in: space id */ + ulint page) /*!< in: page offset */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** +Frees part of a segment. This function can be used to free a segment +by repeatedly calling this function in different mini-transactions. +Doing the freeing in a single mini-transaction might result in +too big a mini-transaction. +@return TRUE if freeing completed */ +UNIV_INTERN +ibool +fseg_free_step( +/*===========*/ + fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header + resides on the first page of the frag list + of the segment, this pointer becomes obsolete + after the last freeing step */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/**********************************************************************//** +Frees part of a segment. Differs from fseg_free_step because this function +leaves the header page unfreed. +@return TRUE if freeing completed, except the header page */ +UNIV_INTERN +ibool +fseg_free_step_not_header( +/*======================*/ + fseg_header_t* header, /*!< in: segment header which must reside on + the first fragment page of the segment */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/***********************************************************************//** +Checks if a page address is an extent descriptor page address. +@return TRUE if a descriptor page */ +UNIV_INLINE +ibool +fsp_descr_page( +/*===========*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no);/*!< in: page number */ +/***********************************************************//** +Parses a redo log record of a file page init. +@return end of log record or NULL */ +UNIV_INTERN +byte* +fsp_parse_init_file_page( +/*=====================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr, /*!< in: buffer end */ + buf_block_t* block); /*!< in: block or NULL */ +/*******************************************************************//** +Validates the file space system and its segments. +@return TRUE if ok */ +UNIV_INTERN +ibool +fsp_validate( +/*=========*/ + ulint space); /*!< in: space id */ +/*******************************************************************//** +Prints info of a file space. */ +UNIV_INTERN +void +fsp_print( +/*======*/ + ulint space); /*!< in: space id */ +#ifdef UNIV_DEBUG +/*******************************************************************//** +Validates a segment. +@return TRUE if ok */ +UNIV_INTERN +ibool +fseg_validate( +/*==========*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +#endif /* UNIV_DEBUG */ +#ifdef UNIV_BTR_PRINT +/*******************************************************************//** +Writes info of a segment. */ +UNIV_INTERN +void +fseg_print( +/*=======*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +#endif /* UNIV_BTR_PRINT */ + +/********************************************************************//** +Validate and return the tablespace flags, which are stored in the +tablespace header at offset FSP_SPACE_FLAGS. They should be 0 for +ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats, +COMPRESSED and DYNAMIC, use a file format > Antelope so they should +have a file format number plus the DICT_TF_COMPACT bit set. +@return true if check ok */ +UNIV_INLINE +bool +fsp_flags_is_valid( +/*===============*/ + ulint flags) /*!< in: tablespace flags */ + __attribute__((warn_unused_result, const)); +/********************************************************************//** +Determine if the tablespace is compressed from dict_table_t::flags. +@return TRUE if compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_compressed( +/*====================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Calculates the descriptor index within a descriptor page. +@return descriptor index */ +UNIV_INLINE +ulint +xdes_calc_descriptor_index( +/*=======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset); /*!< in: page offset */ + +/**********************************************************************//** +Gets a descriptor bit of a page. +@return TRUE if free */ +UNIV_INLINE +ibool +xdes_get_bit( +/*=========*/ + const xdes_t* descr, /*!< in: descriptor */ + ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ulint offset);/*!< in: page offset within extent: + 0 ... FSP_EXTENT_SIZE - 1 */ + +/********************************************************************//** +Calculates the page where the descriptor of a page resides. +@return descriptor page offset */ +UNIV_INLINE +ulint +xdes_calc_descriptor_page( +/*======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset); /*!< in: page offset */ + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************************//** +Extract the zip size from tablespace flags. A tablespace has only one +physical page size whether that page is compressed or not. +@return compressed page size of the file-per-table tablespace in bytes, +or zero if the table is not compressed. */ +UNIV_INLINE +ulint +fsp_flags_get_zip_size( +/*====================*/ + ulint flags); /*!< in: tablespace flags */ +/********************************************************************//** +Extract the page size from tablespace flags. +@return page size of the tablespace in bytes */ +UNIV_INLINE +ulint +fsp_flags_get_page_size( +/*====================*/ + ulint flags); /*!< in: tablespace flags */ + +#ifndef UNIV_NONINL +#include "fsp0fsp.ic" +#endif + +#endif diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic new file mode 100644 index 00000000000..0d81e817cc9 --- /dev/null +++ b/storage/innobase/include/fsp0fsp.ic @@ -0,0 +1,314 @@ +/***************************************************************************** + +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0fsp.ic +File space management + +Created 12/18/1995 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_INNOCHECKSUM + +/***********************************************************************//** +Checks if a page address is an extent descriptor page address. +@return TRUE if a descriptor page */ +UNIV_INLINE +ibool +fsp_descr_page( +/*===========*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no)/*!< in: page number */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (!zip_size) { + return((page_no & (UNIV_PAGE_SIZE - 1)) == FSP_XDES_OFFSET); + } + + return((page_no & (zip_size - 1)) == FSP_XDES_OFFSET); +} + +/********************************************************************//** +Validate and return the tablespace flags, which are stored in the +tablespace header at offset FSP_SPACE_FLAGS. They should be 0 for +ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats, +COMPRESSED and DYNAMIC, use a file format > Antelope so they should +have a file format number plus the DICT_TF_COMPACT bit set. +@return true if check ok */ +UNIV_INLINE +bool +fsp_flags_is_valid( +/*===============*/ + ulint flags) /*!< in: tablespace flags */ +{ + ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(flags); + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); + ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + ulint unused = FSP_FLAGS_GET_UNUSED(flags); + + DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); + + /* fsp_flags is zero unless atomic_blobs is set. */ + /* Make sure there are no bits that we do not know about. */ + if (unused != 0 || flags == 1) { + return(false); + } else if (post_antelope) { + /* The Antelope row formats REDUNDANT and COMPACT did + not use tablespace flags, so this flag and the entire + 4-byte field is zero for Antelope row formats. */ + + if (!atomic_blobs) { + return(false); + } + } + + if (!atomic_blobs) { + /* Barracuda row formats COMPRESSED and DYNAMIC build on + the page structure introduced for the COMPACT row format + by allowing long fields to be broken into prefix and + externally stored parts. */ + + if (post_antelope || zip_ssize != 0) { + return(false); + } + + } else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + return(false); + } else if (page_ssize > UNIV_PAGE_SSIZE_MAX) { + + /* The page size field can be used for any row type, or it may + be zero for an original 16k page size. + Validate the page shift size is within allowed range. */ + + return(false); + + } else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) { + return(false); + } + +#if UNIV_FORMAT_MAX != UNIV_FORMAT_B +# error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations." +#endif + + /* The DATA_DIR field can be used for any row type so there is + nothing here to validate. */ + + return(true); +} + +/********************************************************************//** +Determine if the tablespace is compressed from dict_table_t::flags. +@return TRUE if compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_compressed( +/*====================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_ZIP_SSIZE(flags) != 0); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************************//** +Extract the zip size from tablespace flags. +@return compressed page size of the file-per-table tablespace in bytes, +or zero if the table is not compressed. */ +UNIV_INLINE +ulint +fsp_flags_get_zip_size( +/*===================*/ + ulint flags) /*!< in: tablespace flags */ +{ + ulint zip_size = 0; + ulint ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + + /* Convert from a 'log2 minus 9' to a page size in bytes. */ + if (ssize) { + zip_size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize); + + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + } + + return(zip_size); +} + +/********************************************************************//** +Extract the page size from tablespace flags. +@return page size of the tablespace in bytes */ +UNIV_INLINE +ulint +fsp_flags_get_page_size( +/*====================*/ + ulint flags) /*!< in: tablespace flags */ +{ + ulint page_size = 0; + ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); + + /* Convert from a 'log2 minus 9' to a page size in bytes. */ + if (UNIV_UNLIKELY(ssize)) { + page_size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize); + + ut_ad(page_size <= UNIV_PAGE_SIZE_MAX); + } else { + /* If the page size was not stored, then it is the + original 16k. */ + page_size = UNIV_PAGE_SIZE_ORIG; + } + + return(page_size); +} + +#ifndef UNIV_INNOCHECKSUM + +/********************************************************************//** +Add the page size to the tablespace flags. +@return tablespace flags after page size is added */ +UNIV_INLINE +ulint +fsp_flags_set_page_size( +/*====================*/ + ulint flags, /*!< in: tablespace flags */ + ulint page_size) /*!< in: page size in bytes */ +{ + ulint ssize = 0; + ulint shift; + + /* Page size should be > UNIV_PAGE_SIZE_MIN */ + ut_ad(page_size >= UNIV_PAGE_SIZE_MIN); + ut_ad(page_size <= UNIV_PAGE_SIZE_MAX); + + if (page_size == UNIV_PAGE_SIZE_ORIG) { + ut_ad(0 == FSP_FLAGS_GET_PAGE_SSIZE(flags)); + return(flags); + } + + for (shift = UNIV_PAGE_SIZE_SHIFT_MAX; + shift >= UNIV_PAGE_SIZE_SHIFT_MIN; + shift--) { + ulint mask = (1 << shift); + if (page_size & mask) { + ut_ad(!(page_size & ~mask)); + ssize = shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1; + break; + } + } + + ut_ad(ssize); + ut_ad(ssize <= UNIV_PAGE_SSIZE_MAX); + + flags = FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize); + + ut_ad(fsp_flags_is_valid(flags)); + + return(flags); +} + +/********************************************************************//** +Calculates the descriptor index within a descriptor page. +@return descriptor index */ +UNIV_INLINE +ulint +xdes_calc_descriptor_index( +/*=======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset) /*!< in: page offset */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (zip_size == 0) { + return(ut_2pow_remainder(offset, UNIV_PAGE_SIZE) + / FSP_EXTENT_SIZE); + } else { + return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE); + } +} + +/**********************************************************************//** +Gets a descriptor bit of a page. +@return TRUE if free */ +UNIV_INLINE +ibool +xdes_get_bit( +/*=========*/ + const xdes_t* descr, /*!< in: descriptor */ + ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ulint offset) /*!< in: page offset within extent: + 0 ... FSP_EXTENT_SIZE - 1 */ +{ + ut_ad(offset < FSP_EXTENT_SIZE); + ut_ad(bit == XDES_FREE_BIT || bit == XDES_CLEAN_BIT); + + ulint index = bit + XDES_BITS_PER_PAGE * offset; + + ulint bit_index = index % 8; + ulint byte_index = index / 8; + + return(ut_bit_get_nth( + mach_read_ulint(descr + XDES_BITMAP + byte_index, + MLOG_1BYTE), + bit_index)); +} + +/********************************************************************//** +Calculates the page where the descriptor of a page resides. +@return descriptor page offset */ +UNIV_INLINE +ulint +xdes_calc_descriptor_page( +/*======================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset) /*!< in: page offset */ +{ +#ifndef DOXYGEN /* Doxygen gets confused by these */ +# if UNIV_PAGE_SIZE_MAX <= XDES_ARR_OFFSET \ + + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) \ + * XDES_SIZE_MAX +# error +# endif +# if UNIV_ZIP_SIZE_MIN <= XDES_ARR_OFFSET \ + + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE_MIN) \ + * XDES_SIZE_MIN +# error +# endif +#endif /* !DOXYGEN */ + + ut_ad(UNIV_PAGE_SIZE > XDES_ARR_OFFSET + + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) + * XDES_SIZE); + ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET + + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE) + * XDES_SIZE); + + ut_ad(ut_is_2pow(zip_size)); + + if (zip_size == 0) { + return(ut_2pow_round(offset, UNIV_PAGE_SIZE)); + } else { + ut_ad(zip_size > XDES_ARR_OFFSET + + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE); + return(ut_2pow_round(offset, zip_size)); + } +} + +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h new file mode 100644 index 00000000000..94fd908ab0c --- /dev/null +++ b/storage/innobase/include/fsp0types.h @@ -0,0 +1,116 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/****************************************************** +@file include/fsp0types.h +File space management types + +Created May 26, 2009 Vasil Dimov +*******************************************************/ + +#ifndef fsp0types_h +#define fsp0types_h + +#include "univ.i" + +#include "fil0fil.h" /* for FIL_PAGE_DATA */ + +/** @name Flags for inserting records in order +If records are inserted in order, there are the following +flags to tell this (their type is made byte for the compiler +to warn if direction and hint parameters are switched in +fseg_alloc_free_page) */ +/* @{ */ +#define FSP_UP ((byte)111) /*!< alphabetically upwards */ +#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */ +#define FSP_NO_DIR ((byte)113) /*!< no order */ +/* @} */ + +/** File space extent size (one megabyte) in pages */ +#define FSP_EXTENT_SIZE (1048576U / UNIV_PAGE_SIZE) + +/** File space extent size (one megabyte) in pages for MAX page size */ +#define FSP_EXTENT_SIZE_MAX (1048576 / UNIV_PAGE_SIZE_MAX) + +/** File space extent size (one megabyte) in pages for MIN page size */ +#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN) + +/** On a page of any file segment, data may be put starting from this +offset */ +#define FSEG_PAGE_DATA FIL_PAGE_DATA + +/** @name File segment header +The file segment header points to the inode describing the file segment. */ +/* @{ */ +/** Data type for file segment header */ +typedef byte fseg_header_t; + +#define FSEG_HDR_SPACE 0 /*!< space id of the inode */ +#define FSEG_HDR_PAGE_NO 4 /*!< page number of the inode */ +#define FSEG_HDR_OFFSET 8 /*!< byte offset of the inode */ + +#define FSEG_HEADER_SIZE 10 /*!< Length of the file system + header, in bytes */ +/* @} */ + +/** Flags for fsp_reserve_free_extents @{ */ +#define FSP_NORMAL 1000000 +#define FSP_UNDO 2000000 +#define FSP_CLEANING 3000000 +/* @} */ + +/* Number of pages described in a single descriptor page: currently each page +description takes less than 1 byte; a descriptor page is repeated every +this many file pages */ +/* #define XDES_DESCRIBED_PER_PAGE UNIV_PAGE_SIZE */ +/* This has been replaced with either UNIV_PAGE_SIZE or page_zip->size. */ + +/** @name The space low address page map +The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated +every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */ +/* @{ */ +/*--------------------------------------*/ +#define FSP_XDES_OFFSET 0 /* !< extent descriptor */ +#define FSP_IBUF_BITMAP_OFFSET 1 /* !< insert buffer bitmap */ + /* The ibuf bitmap pages are the ones whose + page number is the number above plus a + multiple of XDES_DESCRIBED_PER_PAGE */ + +#define FSP_FIRST_INODE_PAGE_NO 2 /*!< in every tablespace */ + /* The following pages exist + in the system tablespace (space 0). */ +#define FSP_IBUF_HEADER_PAGE_NO 3 /*!< insert buffer + header page, in + tablespace 0 */ +#define FSP_IBUF_TREE_ROOT_PAGE_NO 4 /*!< insert buffer + B-tree root page in + tablespace 0 */ + /* The ibuf tree root page number in + tablespace 0; its fseg inode is on the page + number FSP_FIRST_INODE_PAGE_NO */ +#define FSP_TRX_SYS_PAGE_NO 5 /*!< transaction + system header, in + tablespace 0 */ +#define FSP_FIRST_RSEG_PAGE_NO 6 /*!< first rollback segment + page, in tablespace 0 */ +#define FSP_DICT_HDR_PAGE_NO 7 /*!< data dictionary header + page, in tablespace 0 */ +/*--------------------------------------*/ +/* @} */ + +#endif /* fsp0types_h */ diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h new file mode 100644 index 00000000000..b2380f78b39 --- /dev/null +++ b/storage/innobase/include/fts0ast.h @@ -0,0 +1,339 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0ast.h +The FTS query parser (AST) abstract syntax tree routines + +Created 2007/03/16/03 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FST0AST_H +#define INNOBASE_FST0AST_H + +#include "mem0mem.h" +#include "ha_prototypes.h" + +/* The type of AST Node */ +enum fts_ast_type_t { + FTS_AST_OPER, /*!< Operator */ + FTS_AST_NUMB, /*!< Number */ + FTS_AST_TERM, /*!< Term (or word) */ + FTS_AST_TEXT, /*!< Text string */ + FTS_AST_LIST, /*!< Expression list */ + FTS_AST_SUBEXP_LIST /*!< Sub-Expression list */ +}; + +/* The FTS query operators that we support */ +enum fts_ast_oper_t { + FTS_NONE, /*!< No operator */ + + FTS_IGNORE, /*!< Ignore rows that contain + this word */ + + FTS_EXIST, /*!< Include rows that contain + this word */ + + FTS_NEGATE, /*!< Include rows that contain + this word but rank them + lower*/ + + FTS_INCR_RATING, /*!< Increase the rank for this + word*/ + + FTS_DECR_RATING, /*!< Decrease the rank for this + word*/ + + FTS_DISTANCE, /*!< Proximity distance */ + FTS_IGNORE_SKIP, /*!< Transient node operator + signifies that this is a + FTS_IGNORE node, and ignored in + the first pass of + fts_ast_visit() */ + FTS_EXIST_SKIP /*!< Transient node operator + signifies that this ia a + FTS_EXIST node, and ignored in + the first pass of + fts_ast_visit() */ +}; + +/* Data types used by the FTS parser */ +struct fts_lexer_t; +struct fts_ast_node_t; +struct fts_ast_state_t; +struct fts_ast_string_t; + +typedef dberr_t (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*); + +/******************************************************************** +Parse the string using the lexer setup within state.*/ +int +fts_parse( +/*======*/ + /* out: 0 on OK, 1 on error */ + fts_ast_state_t* state); /*!< in: ast state instance.*/ + +/******************************************************************** +Create an AST operator node */ +extern +fts_ast_node_t* +fts_ast_create_node_oper( +/*=====================*/ + void* arg, /*!< in: ast state */ + fts_ast_oper_t oper); /*!< in: ast operator */ +/******************************************************************** +Create an AST term node, makes a copy of ptr */ +extern +fts_ast_node_t* +fts_ast_create_node_term( +/*=====================*/ + void* arg, /*!< in: ast state */ + const fts_ast_string_t* ptr); /*!< in: term string */ +/******************************************************************** +Create an AST text node */ +extern +fts_ast_node_t* +fts_ast_create_node_text( +/*=====================*/ + void* arg, /*!< in: ast state */ + const fts_ast_string_t* ptr); /*!< in: text string */ +/******************************************************************** +Create an AST expr list node */ +extern +fts_ast_node_t* +fts_ast_create_node_list( +/*=====================*/ + void* arg, /*!< in: ast state */ + fts_ast_node_t* expr); /*!< in: ast expr */ +/******************************************************************** +Create a sub-expression list node. This function takes ownership of +expr and is responsible for deleting it. */ +extern +fts_ast_node_t* +fts_ast_create_node_subexp_list( +/*============================*/ + /* out: new node */ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr); /*!< in: ast expr instance */ +/******************************************************************** +Set the wildcard attribute of a term.*/ +extern +void +fts_ast_term_set_wildcard( +/*======================*/ + fts_ast_node_t* node); /*!< in: term to change */ +/******************************************************************** +Set the proximity attribute of a text node. */ + +void +fts_ast_term_set_distance( +/*======================*/ + fts_ast_node_t* node, /*!< in/out: text node */ + ulint distance); /*!< in: the text proximity + distance */ +/********************************************************************//** +Free a fts_ast_node_t instance. +@return next node to free */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_free_node( +/*==============*/ + fts_ast_node_t* node); /*!< in: node to free */ +/******************************************************************** +Add a sub-expression to an AST*/ +extern +fts_ast_node_t* +fts_ast_add_node( +/*=============*/ + fts_ast_node_t* list, /*!< in: list node instance */ + fts_ast_node_t* node); /*!< in: (sub) expr to add */ +/******************************************************************** +Print the AST node recursively.*/ +extern +void +fts_ast_node_print( +/*===============*/ + fts_ast_node_t* node); /*!< in: ast node to print */ +/******************************************************************** +For tracking node allocations, in case there is an during parsing.*/ +extern +void +fts_ast_state_add_node( +/*===================*/ + fts_ast_state_t*state, /*!< in: ast state instance */ + fts_ast_node_t* node); /*!< in: node to add to state */ +/******************************************************************** +Free node and expr allocations.*/ +extern +void +fts_ast_state_free( +/*===============*/ + fts_ast_state_t*state); /*!< in: state instance + to free */ +/******************************************************************//** +Traverse the AST - in-order traversal. +@return DB_SUCCESS if all went well */ +UNIV_INTERN +dberr_t +fts_ast_visit( +/*==========*/ + fts_ast_oper_t oper, /*!< in: FTS operator */ + fts_ast_node_t* node, /*!< in: instance to traverse*/ + fts_ast_callback visitor, /*!< in: callback */ + void* arg, /*!< in: callback arg */ + bool* has_ignore) /*!< out: whether we encounter + and ignored processing an + operator, currently we only + ignore FTS_IGNORE operator */ + __attribute__((nonnull, warn_unused_result)); +/*****************************************************************//** +Process (nested) sub-expression, create a new result set to store the +sub-expression result by processing nodes under current sub-expression +list. Merge the sub-expression result with that of parent expression list. +@return DB_SUCCESS if all went well */ +UNIV_INTERN +dberr_t +fts_ast_visit_sub_exp( +/*==================*/ + fts_ast_node_t* node, /*!< in: instance to traverse*/ + fts_ast_callback visitor, /*!< in: callback */ + void* arg) /*!< in: callback arg */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************** +Create a lex instance.*/ +UNIV_INTERN +fts_lexer_t* +fts_lexer_create( +/*=============*/ + ibool boolean_mode, /*!< in: query type */ + const byte* query, /*!< in: query string */ + ulint query_len) /*!< in: query string len */ + __attribute__((nonnull, malloc, warn_unused_result)); +/******************************************************************** +Free an fts_lexer_t instance.*/ +UNIV_INTERN +void +fts_lexer_free( +/*===========*/ + fts_lexer_t* fts_lexer) /*!< in: lexer instance to + free */ + __attribute__((nonnull)); + +/** +Create an ast string object, with NUL-terminator, so the string +has one more byte than len +@param[in] str pointer to string +@param[in] len length of the string +@return ast string with NUL-terminator */ +UNIV_INTERN +fts_ast_string_t* +fts_ast_string_create( + const byte* str, + ulint len); + +/** +Free an ast string instance +@param[in,out] ast_str string to free */ +UNIV_INTERN +void +fts_ast_string_free( + fts_ast_string_t* ast_str); + +/** +Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul +@param[in] str string to translate +@param[in] base the base +@return translated number */ +UNIV_INTERN +ulint +fts_ast_string_to_ul( + const fts_ast_string_t* ast_str, + int base); + +/** +Print the ast string +@param[in] str string to print */ +UNIV_INTERN +void +fts_ast_string_print( + const fts_ast_string_t* ast_str); + +/* String of length len. +We always store the string of length len with a terminating '\0', +regardless of there is any 0x00 in the string itself */ +struct fts_ast_string_t { + /*!< Pointer to string. */ + byte* str; + + /*!< Length of the string. */ + ulint len; +}; + +/* Query term type */ +struct fts_ast_term_t { + fts_ast_string_t* ptr; /*!< Pointer to term string.*/ + ibool wildcard; /*!< TRUE if wild card set.*/ +}; + +/* Query text type */ +struct fts_ast_text_t { + fts_ast_string_t* ptr; /*!< Pointer to text string.*/ + ulint distance; /*!< > 0 if proximity distance + set */ +}; + +/* The list of nodes in an expr list */ +struct fts_ast_list_t { + fts_ast_node_t* head; /*!< Children list head */ + fts_ast_node_t* tail; /*!< Children list tail */ +}; + +/* FTS AST node to store the term, text, operator and sub-expressions.*/ +struct fts_ast_node_t { + fts_ast_type_t type; /*!< The type of node */ + fts_ast_text_t text; /*!< Text node */ + fts_ast_term_t term; /*!< Term node */ + fts_ast_oper_t oper; /*!< Operator value */ + fts_ast_list_t list; /*!< Expression list */ + fts_ast_node_t* next; /*!< Link for expr list */ + fts_ast_node_t* next_alloc; /*!< For tracking allocations */ + bool visited; /*!< whether this node is + already processed */ +}; + +/* To track state during parsing */ +struct fts_ast_state_t { + mem_heap_t* heap; /*!< Heap to use for alloc */ + fts_ast_node_t* root; /*!< If all goes OK, then this + will point to the root.*/ + + fts_ast_list_t list; /*!< List of nodes allocated */ + + fts_lexer_t* lexer; /*!< Lexer callback + arg */ + CHARSET_INFO* charset; /*!< charset used for + tokenization */ +}; + +#ifdef UNIV_DEBUG +const char* +fts_ast_oper_name_get(fts_ast_oper_t oper); +const char* +fts_ast_node_type_get(fts_ast_type_t type); +#endif /* UNIV_DEBUG */ + +#endif /* INNOBASE_FSTS0AST_H */ diff --git a/storage/innobase/include/fts0blex.h b/storage/innobase/include/fts0blex.h new file mode 100644 index 00000000000..d0e4cae0678 --- /dev/null +++ b/storage/innobase/include/fts0blex.h @@ -0,0 +1,349 @@ +#ifndef fts0bHEADER_H +#define fts0bHEADER_H 1 +#define fts0bIN_HEADER 1 + +#line 6 "../include/fts0blex.h" + +#line 8 "../include/fts0blex.h" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 35 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void fts0brestart (FILE *input_file ,yyscan_t yyscanner ); +void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0b_create_buffer (FILE *file,int size ,yyscan_t yyscanner ); +void fts0b_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0b_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +void fts0bpop_buffer_state (yyscan_t yyscanner ); + +YY_BUFFER_STATE fts0b_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0b_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0b_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner ); + +void *fts0balloc (yy_size_t ,yyscan_t yyscanner ); +void *fts0brealloc (void *,yy_size_t ,yyscan_t yyscanner ); +void fts0bfree (void * ,yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define fts0bwrap(n) 1 +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 + +#endif + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +int fts0blex_init (yyscan_t* scanner); + +int fts0blex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int fts0blex_destroy (yyscan_t yyscanner ); + +int fts0bget_debug (yyscan_t yyscanner ); + +void fts0bset_debug (int debug_flag ,yyscan_t yyscanner ); + +YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner ); + +void fts0bset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner ); + +FILE *fts0bget_in (yyscan_t yyscanner ); + +void fts0bset_in (FILE * in_str ,yyscan_t yyscanner ); + +FILE *fts0bget_out (yyscan_t yyscanner ); + +void fts0bset_out (FILE * out_str ,yyscan_t yyscanner ); + +int fts0bget_leng (yyscan_t yyscanner ); + +char *fts0bget_text (yyscan_t yyscanner ); + +int fts0bget_lineno (yyscan_t yyscanner ); + +void fts0bset_lineno (int line_number ,yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int fts0bwrap (yyscan_t yyscanner ); +#else +extern int fts0bwrap (yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int fts0blex (yyscan_t yyscanner); + +#define YY_DECL int fts0blex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif + +#line 73 "fts0blex.l" + + +#line 348 "../include/fts0blex.h" +#undef fts0bIN_HEADER +#endif /* fts0bHEADER_H */ diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h new file mode 100644 index 00000000000..a2996ecacc8 --- /dev/null +++ b/storage/innobase/include/fts0fts.h @@ -0,0 +1,1039 @@ +/***************************************************************************** + +Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0fts.h +Full text search header file + +Created 2011/09/02 Sunny Bains +***********************************************************************/ + +#ifndef fts0fts_h +#define fts0fts_h + +#include "univ.i" + +#include "data0type.h" +#include "data0types.h" +#include "dict0types.h" +#include "hash0hash.h" +#include "mem0mem.h" +#include "rem0types.h" +#include "row0types.h" +#include "trx0types.h" +#include "ut0vec.h" +#include "ut0rbt.h" +#include "ut0wqueue.h" +#include "que0types.h" +#include "ft_global.h" + +/** "NULL" value of a document id. */ +#define FTS_NULL_DOC_ID 0 + +/** FTS hidden column that is used to map to and from the row */ +#define FTS_DOC_ID_COL_NAME "FTS_DOC_ID" + +/** The name of the index created by FTS */ +#define FTS_DOC_ID_INDEX_NAME "FTS_DOC_ID_INDEX" + +#define FTS_DOC_ID_INDEX_NAME_LEN 16 + +/** Doc ID is a 8 byte value */ +#define FTS_DOC_ID_LEN 8 + +/** The number of fields to sort when we build FT index with +FIC. Three fields are sort: (word, doc_id, position) */ +#define FTS_NUM_FIELDS_SORT 3 + +/** Maximum number of rows in a table, smaller than which, we will +optimize using a 4 byte Doc ID for FIC merge sort to reduce sort size */ +#define MAX_DOC_ID_OPT_VAL 1073741824 + +/** Document id type. */ +typedef ib_uint64_t doc_id_t; + +/** doc_id_t printf format */ +#define FTS_DOC_ID_FORMAT IB_ID_FMT + +/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */ +#define fts_write_doc_id(d, s) mach_write_to_8(d, s) + +/** Read a document id to internal format. */ +#define fts_read_doc_id(s) mach_read_from_8(s) + +/** Bind the doc id to a variable */ +#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v) + +/** Defines for FTS query mode, they have the same values as +those defined in mysql file ft_global.h */ +#define FTS_NL 0 +#define FTS_BOOL 1 +#define FTS_SORTED 2 +#define FTS_EXPAND 4 +#define FTS_PROXIMITY 8 +#define FTS_PHRASE 16 +#define FTS_OPT_RANKING 32 + +#define FTS_INDEX_TABLE_IND_NAME "FTS_INDEX_TABLE_IND" + +/** Threshold where our optimize thread automatically kicks in */ +#define FTS_OPTIMIZE_THRESHOLD 10000000 + +#define FTS_DOC_ID_MAX_STEP 10000 +/** Variable specifying the FTS parallel sort degree */ +extern ulong fts_sort_pll_degree; + +/** Variable specifying the number of word to optimize for each optimize table +call */ +extern ulong fts_num_word_optimize; + +/** Variable specifying whether we do additional FTS diagnostic printout +in the log */ +extern char fts_enable_diag_print; + +/** FTS rank type, which will be between 0 .. 1 inclusive */ +typedef float fts_rank_t; + +/** Type of a row during a transaction. FTS_NOTHING means the row can be +forgotten from the FTS system's POV, FTS_INVALID is an internal value used +to mark invalid states. + +NOTE: Do not change the order or value of these, fts_trx_row_get_new_state +depends on them being exactly as they are. */ +enum fts_row_state { + FTS_INSERT = 0, + FTS_MODIFY, + FTS_DELETE, + FTS_NOTHING, + FTS_INVALID +}; + +/** The FTS table types. */ +enum fts_table_type_t { + FTS_INDEX_TABLE, /*!< FTS auxiliary table that is + specific to a particular FTS index + on a table */ + + FTS_COMMON_TABLE /*!< FTS auxiliary table that is common + for all FTS index on a table */ +}; + +struct fts_doc_t; +struct fts_cache_t; +struct fts_token_t; +struct fts_doc_ids_t; +struct fts_index_cache_t; + + +/** Initialize the "fts_table" for internal query into FTS auxiliary +tables */ +#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\ +do { \ + (fts_table)->suffix = m_suffix; \ + (fts_table)->type = m_type; \ + (fts_table)->table_id = m_table->id; \ + (fts_table)->parent = m_table->name; \ + (fts_table)->table = m_table; \ +} while (0); + +#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\ +do { \ + (fts_table)->suffix = m_suffix; \ + (fts_table)->type = m_type; \ + (fts_table)->table_id = m_index->table->id; \ + (fts_table)->parent = m_index->table->name; \ + (fts_table)->table = m_index->table; \ + (fts_table)->index_id = m_index->id; \ +} while (0); + +/** Information about changes in a single transaction affecting +the FTS system. */ +struct fts_trx_t { + trx_t* trx; /*!< InnoDB transaction */ + + ib_vector_t* savepoints; /*!< Active savepoints, must have at + least one element, the implied + savepoint */ + ib_vector_t* last_stmt; /*!< last_stmt */ + + mem_heap_t* heap; /*!< heap */ +}; + +/** Information required for transaction savepoint handling. */ +struct fts_savepoint_t { + char* name; /*!< First entry is always NULL, the + default instance. Otherwise the name + of the savepoint */ + + ib_rbt_t* tables; /*!< Modified FTS tables */ +}; + +/** Information about changed rows in a transaction for a single table. */ +struct fts_trx_table_t { + dict_table_t* table; /*!< table */ + + fts_trx_t* fts_trx; /*!< link to parent */ + + ib_rbt_t* rows; /*!< rows changed; indexed by doc-id, + cells are fts_trx_row_t* */ + + fts_doc_ids_t* added_doc_ids; /*!< list of added doc ids (NULL until + the first addition) */ + + /*!< for adding doc ids */ + que_t* docs_added_graph; +}; + +/** Information about one changed row in a transaction. */ +struct fts_trx_row_t { + doc_id_t doc_id; /*!< Id of the ins/upd/del document */ + + fts_row_state state; /*!< state of the row */ + + ib_vector_t* fts_indexes; /*!< The indexes that are affected */ +}; + +/** List of document ids that were added during a transaction. This +list is passed on to a background 'Add' thread and OPTIMIZE, so it +needs its own memory heap. */ +struct fts_doc_ids_t { + ib_vector_t* doc_ids; /*!< document ids (each element is + of type doc_id_t). */ + + ib_alloc_t* self_heap; /*!< Allocator used to create an + instance of this type and the + doc_ids vector */ +}; + +// FIXME: Get rid of this if possible. +/** Since MySQL's character set support for Unicode is woefully inadequate +(it supports basic operations like isalpha etc. only for 8-bit characters), +we have to implement our own. We use UTF-16 without surrogate processing +as our in-memory format. This typedef is a single such character. */ +typedef unsigned short ib_uc_t; + +/** An UTF-16 ro UTF-8 string. */ +struct fts_string_t { + byte* f_str; /*!< string, not necessary terminated in + any way */ + ulint f_len; /*!< Length of the string in bytes */ + ulint f_n_char; /*!< Number of characters */ +}; + +/** Query ranked doc ids. */ +struct fts_ranking_t { + doc_id_t doc_id; /*!< Document id */ + + fts_rank_t rank; /*!< Rank is between 0 .. 1 */ + + byte* words; /*!< this contains the words + that were queried + and found in this document */ + ulint words_len; /*!< words len */ +}; + +/** Query result. */ +struct fts_result_t { + ib_rbt_node_t* current; /*!< Current element */ + + ib_rbt_t* rankings_by_id; /*!< RB tree of type fts_ranking_t + indexed by doc id */ + ib_rbt_t* rankings_by_rank;/*!< RB tree of type fts_ranking_t + indexed by rank */ +}; + +/** This is used to generate the FTS auxiliary table name, we need the +table id and the index id to generate the column specific FTS auxiliary +table name. */ +struct fts_table_t { + const char* parent; /*!< Parent table name, this is + required only for the database + name */ + + fts_table_type_t + type; /*!< The auxiliary table type */ + + table_id_t table_id; /*!< The table id */ + + index_id_t index_id; /*!< The index id */ + + const char* suffix; /*!< The suffix of the fts auxiliary + table name, can be NULL, not used + everywhere (yet) */ + const dict_table_t* + table; /*!< Parent table */ + CHARSET_INFO* charset; /*!< charset info if it is for FTS + index auxiliary table */ +}; + +enum fts_status { + BG_THREAD_STOP = 1, /*!< TRUE if the FTS background thread + has finished reading the ADDED table, + meaning more items can be added to + the table. */ + + BG_THREAD_READY = 2, /*!< TRUE if the FTS background thread + is ready */ + + ADD_THREAD_STARTED = 4, /*!< TRUE if the FTS add thread + has started */ + + ADDED_TABLE_SYNCED = 8, /*!< TRUE if the ADDED table record is + sync-ed after crash recovery */ + + TABLE_DICT_LOCKED = 16 /*!< Set if the table has + dict_sys->mutex */ +}; + +typedef enum fts_status fts_status_t; + +/** The state of the FTS sub system. */ +struct fts_t { + /*!< mutex protecting bg_threads* and + fts_add_wq. */ + ib_mutex_t bg_threads_mutex; + + ulint bg_threads; /*!< number of background threads + accessing this table */ + + /*!< TRUE if background threads running + should stop themselves */ + ulint fts_status; /*!< Status bit regarding fts + running state */ + + ib_wqueue_t* add_wq; /*!< Work queue for scheduling jobs + for the FTS 'Add' thread, or NULL + if the thread has not yet been + created. Each work item is a + fts_trx_doc_ids_t*. */ + + fts_cache_t* cache; /*!< FTS memory buffer for this table, + or NULL if the table has no FTS + index. */ + + ulint doc_col; /*!< FTS doc id hidden column number + in the CLUSTERED index. */ + + ib_vector_t* indexes; /*!< Vector of FTS indexes, this is + mainly for caching purposes. */ + mem_heap_t* fts_heap; /*!< heap for fts_t allocation */ +}; + +struct fts_stopword_t; + +/** status bits for fts_stopword_t status field. */ +#define STOPWORD_NOT_INIT 0x1 +#define STOPWORD_OFF 0x2 +#define STOPWORD_FROM_DEFAULT 0x4 +#define STOPWORD_USER_TABLE 0x8 + +extern const char* fts_default_stopword[]; + +/** Variable specifying the maximum FTS cache size for each table */ +extern ulong fts_max_cache_size; + +/** Variable specifying the total memory allocated for FTS cache */ +extern ulong fts_max_total_cache_size; + +/** Variable specifying the FTS result cache limit for each query */ +extern ulong fts_result_cache_limit; + +/** Variable specifying the maximum FTS max token size */ +extern ulong fts_max_token_size; + +/** Variable specifying the minimum FTS max token size */ +extern ulong fts_min_token_size; + +/** Whether the total memory used for FTS cache is exhausted, and we will +need a sync to free some memory */ +extern bool fts_need_sync; + +/** Maximum possible Fulltext word length */ +#define FTS_MAX_WORD_LEN HA_FT_MAXBYTELEN + +/** Maximum possible Fulltext word length (in characters) */ +#define FTS_MAX_WORD_LEN_IN_CHAR HA_FT_MAXCHARLEN + +/** Variable specifying the table that has Fulltext index to display its +content through information schema table */ +extern char* fts_internal_tbl_name; + +#define fts_que_graph_free(graph) \ +do { \ + mutex_enter(&dict_sys->mutex); \ + que_graph_free(graph); \ + mutex_exit(&dict_sys->mutex); \ +} while (0) + +/******************************************************************//** +Create a FTS cache. */ +UNIV_INTERN +fts_cache_t* +fts_cache_create( +/*=============*/ + dict_table_t* table); /*!< table owns the FTS cache */ + +/******************************************************************//** +Create a FTS index cache. +@return Index Cache */ +UNIV_INTERN +fts_index_cache_t* +fts_cache_index_cache_create( +/*=========================*/ + dict_table_t* table, /*!< in: table with FTS index */ + dict_index_t* index); /*!< in: FTS index */ + +/******************************************************************//** +Get the next available document id. This function creates a new +transaction to generate the document id. +@return DB_SUCCESS if OK */ +UNIV_INTERN +dberr_t +fts_get_next_doc_id( +/*================*/ + const dict_table_t* table, /*!< in: table */ + doc_id_t* doc_id) /*!< out: new document id */ + __attribute__((nonnull)); +/*********************************************************************//** +Update the next and last Doc ID in the CONFIG table to be the input +"doc_id" value (+ 1). We would do so after each FTS index build or +table truncate */ +UNIV_INTERN +void +fts_update_next_doc_id( +/*===================*/ + trx_t* trx, /*!< in/out: transaction */ + const dict_table_t* table, /*!< in: table */ + const char* table_name, /*!< in: table name, or NULL */ + doc_id_t doc_id) /*!< in: DOC ID to set */ + __attribute__((nonnull(2))); + +/******************************************************************//** +Create a new document id . +@return DB_SUCCESS if all went well else error */ +UNIV_INTERN +dberr_t +fts_create_doc_id( +/*==============*/ + dict_table_t* table, /*!< in: row is of this + table. */ + dtuple_t* row, /*!< in/out: add doc id + value to this row. This is the + current row that is being + inserted. */ + mem_heap_t* heap) /*!< in: heap */ + __attribute__((nonnull)); +/******************************************************************//** +Create a new fts_doc_ids_t. +@return new fts_doc_ids_t. */ +UNIV_INTERN +fts_doc_ids_t* +fts_doc_ids_create(void); +/*=====================*/ + +/******************************************************************//** +Free a fts_doc_ids_t. */ +UNIV_INTERN +void +fts_doc_ids_free( +/*=============*/ + fts_doc_ids_t* doc_ids); /*!< in: doc_ids to free */ + +/******************************************************************//** +Notify the FTS system about an operation on an FTS-indexed table. */ +UNIV_INTERN +void +fts_trx_add_op( +/*===========*/ + trx_t* trx, /*!< in: InnoDB transaction */ + dict_table_t* table, /*!< in: table */ + doc_id_t doc_id, /*!< in: doc id */ + fts_row_state state, /*!< in: state of the row */ + ib_vector_t* fts_indexes) /*!< in: FTS indexes affected + (NULL=all) */ + __attribute__((nonnull(1,2))); + +/******************************************************************//** +Free an FTS trx. */ +UNIV_INTERN +void +fts_trx_free( +/*=========*/ + fts_trx_t* fts_trx); /*!< in, own: FTS trx */ + +/******************************************************************//** +Creates the common ancillary tables needed for supporting an FTS index +on the given table. row_mysql_lock_data_dictionary must have been +called before this. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_create_common_tables( +/*=====================*/ + trx_t* trx, /*!< in: transaction handle */ + const dict_table_t* + table, /*!< in: table with one FTS + index */ + const char* name, /*!< in: table name */ + bool skip_doc_id_index) /*!< in: Skip index on doc id */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Wrapper function of fts_create_index_tables_low(), create auxiliary +tables for an FTS index +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_create_index_tables( +/*====================*/ + trx_t* trx, /*!< in: transaction handle */ + const dict_index_t* index) /*!< in: the FTS index + instance */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Creates the column specific ancillary tables needed for supporting an +FTS index on the given table. row_mysql_lock_data_dictionary must have +been called before this. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_create_index_tables_low( +/*========================*/ + trx_t* trx, /*!< in: transaction handle */ + const dict_index_t* + index, /*!< in: the FTS index + instance */ + const char* table_name, /*!< in: the table name */ + table_id_t table_id) /*!< in: the table id */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Add the FTS document id hidden column. */ +UNIV_INTERN +void +fts_add_doc_id_column( +/*==================*/ + dict_table_t* table, /*!< in/out: Table with FTS index */ + mem_heap_t* heap) /*!< in: temporary memory heap, or NULL */ + __attribute__((nonnull(1))); + +/*********************************************************************//** +Drops the ancillary tables needed for supporting an FTS index on the +given table. row_mysql_lock_data_dictionary must have been called before +this. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_drop_tables( +/*============*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table has the FTS + index */ + __attribute__((nonnull)); +/******************************************************************//** +The given transaction is about to be committed; do whatever is necessary +from the FTS system's POV. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_commit( +/*=======*/ + trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull, warn_unused_result)); + +/*******************************************************************//** +FTS Query entry point. +@return DB_SUCCESS if successful otherwise error code */ +UNIV_INTERN +dberr_t +fts_query( +/*======*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index to search */ + uint flags, /*!< in: FTS search mode */ + const byte* query, /*!< in: FTS query */ + ulint query_len, /*!< in: FTS query string len + in bytes */ + fts_result_t** result) /*!< out: query result, to be + freed by the caller.*/ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************************//** +Retrieve the FTS Relevance Ranking result for doc with doc_id +@return the relevance ranking value. */ +UNIV_INTERN +float +fts_retrieve_ranking( +/*=================*/ + fts_result_t* result, /*!< in: FTS result structure */ + doc_id_t doc_id); /*!< in: the interested document + doc_id */ + +/******************************************************************//** +FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */ +UNIV_INTERN +void +fts_query_sort_result_on_rank( +/*==========================*/ + fts_result_t* result); /*!< out: result instance + to sort.*/ + +/******************************************************************//** +FTS Query free result, returned by fts_query(). */ +UNIV_INTERN +void +fts_query_free_result( +/*==================*/ + fts_result_t* result); /*!< in: result instance + to free.*/ + +/******************************************************************//** +Extract the doc id from the FTS hidden column. */ +UNIV_INTERN +doc_id_t +fts_get_doc_id_from_row( +/*====================*/ + dict_table_t* table, /*!< in: table */ + dtuple_t* row); /*!< in: row whose FTS doc id we + want to extract.*/ + +/******************************************************************//** +Extract the doc id from the FTS hidden column. */ +UNIV_INTERN +doc_id_t +fts_get_doc_id_from_rec( +/*====================*/ + dict_table_t* table, /*!< in: table */ + const rec_t* rec, /*!< in: rec */ + mem_heap_t* heap); /*!< in: heap */ + +/******************************************************************//** +Update the query graph with a new document id. +@return Doc ID used */ +UNIV_INTERN +doc_id_t +fts_update_doc_id( +/*==============*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* ufield, /*!< out: update node */ + doc_id_t* next_doc_id); /*!< out: buffer for writing */ + +/******************************************************************//** +FTS initialize. */ +UNIV_INTERN +void +fts_startup(void); +/*==============*/ + +/******************************************************************//** +Signal FTS threads to initiate shutdown. */ +UNIV_INTERN +void +fts_start_shutdown( +/*===============*/ + dict_table_t* table, /*!< in: table with FTS + indexes */ + fts_t* fts); /*!< in: fts instance to + shutdown */ + +/******************************************************************//** +Wait for FTS threads to shutdown. */ +UNIV_INTERN +void +fts_shutdown( +/*=========*/ + dict_table_t* table, /*!< in: table with FTS + indexes */ + fts_t* fts); /*!< in: fts instance to + shutdown */ + +/******************************************************************//** +Create an instance of fts_t. +@return instance of fts_t */ +UNIV_INTERN +fts_t* +fts_create( +/*=======*/ + dict_table_t* table); /*!< out: table with FTS + indexes */ + +/**********************************************************************//** +Free the FTS resources. */ +UNIV_INTERN +void +fts_free( +/*=====*/ + dict_table_t* table); /*!< in/out: table with + FTS indexes */ + +/*********************************************************************//** +Run OPTIMIZE on the given table. +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +fts_optimize_table( +/*===============*/ + dict_table_t* table) /*!< in: table to optimiza */ + __attribute__((nonnull)); + +/**********************************************************************//** +Startup the optimize thread and create the work queue. */ +UNIV_INTERN +void +fts_optimize_init(void); +/*====================*/ + +/**********************************************************************//** +Check whether the work queue is initialized. +@return TRUE if optimze queue is initialized. */ +UNIV_INTERN +ibool +fts_optimize_is_init(void); +/*======================*/ + +/****************************************************************//** +Drops index ancillary tables for a FTS index +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_drop_index_tables( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index) /*!< in: Index to drop */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************************//** +Remove the table from the OPTIMIZER's list. We do wait for +acknowledgement from the consumer of the message. */ +UNIV_INTERN +void +fts_optimize_remove_table( +/*======================*/ + dict_table_t* table); /*!< in: table to remove */ + +/**********************************************************************//** +Signal the optimize thread to prepare for shutdown. */ +UNIV_INTERN +void +fts_optimize_start_shutdown(void); +/*==============================*/ + +/**********************************************************************//** +Inform optimize to clean up. */ +UNIV_INTERN +void +fts_optimize_end(void); +/*===================*/ + +/**********************************************************************//** +Take a FTS savepoint. */ +UNIV_INTERN +void +fts_savepoint_take( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + fts_trx_t* fts_trx, /*!< in: fts transaction */ + const char* name) /*!< in: savepoint name */ + __attribute__((nonnull)); +/**********************************************************************//** +Refresh last statement savepoint. */ +UNIV_INTERN +void +fts_savepoint_laststmt_refresh( +/*===========================*/ + trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull)); +/**********************************************************************//** +Release the savepoint data identified by name. */ +UNIV_INTERN +void +fts_savepoint_release( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + const char* name); /*!< in: savepoint name */ + +/**********************************************************************//** +Free the FTS cache. */ +UNIV_INTERN +void +fts_cache_destroy( +/*==============*/ + fts_cache_t* cache); /*!< in: cache*/ + +/*********************************************************************//** +Clear cache. */ +UNIV_INTERN +void +fts_cache_clear( +/*============*/ + fts_cache_t* cache); /*!< in: cache */ + +/*********************************************************************//** +Initialize things in cache. */ +UNIV_INTERN +void +fts_cache_init( +/*===========*/ + fts_cache_t* cache); /*!< in: cache */ + +/*********************************************************************//** +Rollback to and including savepoint indentified by name. */ +UNIV_INTERN +void +fts_savepoint_rollback( +/*===================*/ + trx_t* trx, /*!< in: transaction */ + const char* name); /*!< in: savepoint name */ + +/*********************************************************************//** +Rollback to and including savepoint indentified by name. */ +UNIV_INTERN +void +fts_savepoint_rollback_last_stmt( +/*=============================*/ + trx_t* trx); /*!< in: transaction */ + +/***********************************************************************//** +Drop all orphaned FTS auxiliary tables, those that don't have a parent +table or FTS index defined on them. */ +UNIV_INTERN +void +fts_drop_orphaned_tables(void); +/*==========================*/ + +/******************************************************************//** +Since we do a horizontal split on the index table, we need to drop +all the split tables. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_drop_index_split_tables( +/*========================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index) /*!< in: fts instance */ + __attribute__((nonnull, warn_unused_result)); + +/****************************************************************//** +Run SYNC on the table, i.e., write out data from the cache to the +FTS auxiliary INDEX table and clear the cache at the end. */ +UNIV_INTERN +dberr_t +fts_sync_table( +/*===========*/ + dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); + +/****************************************************************//** +Free the query graph but check whether dict_sys->mutex is already +held */ +UNIV_INTERN +void +fts_que_graph_free_check_lock( +/*==========================*/ + fts_table_t* fts_table, /*!< in: FTS table */ + const fts_index_cache_t*index_cache, /*!< in: FTS index cache */ + que_t* graph); /*!< in: query graph */ + +/****************************************************************//** +Create an FTS index cache. */ +UNIV_INTERN +CHARSET_INFO* +fts_index_get_charset( +/*==================*/ + dict_index_t* index); /*!< in: FTS index */ + +/*********************************************************************//** +Get the initial Doc ID by consulting the CONFIG table +@return initial Doc ID */ +UNIV_INTERN +doc_id_t +fts_init_doc_id( +/*============*/ + const dict_table_t* table); /*!< in: table */ + +/******************************************************************//** +compare two character string according to their charset. */ +extern +int +innobase_fts_text_cmp( +/*==================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/******************************************************************//** +Makes all characters in a string lower case. */ +extern +size_t +innobase_fts_casedn_str( +/*====================*/ + CHARSET_INFO* cs, /*!< in: Character set */ + char* src, /*!< in: string to put in + lower case */ + size_t src_len, /*!< in: input string length */ + char* dst, /*!< in: buffer for result + string */ + size_t dst_len); /*!< in: buffer size */ + + +/******************************************************************//** +compare two character string according to their charset. */ +extern +int +innobase_fts_text_cmp_prefix( +/*=========================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/*************************************************************//** +Get the next token from the given string and store it in *token. */ +extern +ulint +innobase_mysql_fts_get_token( +/*=========================*/ + CHARSET_INFO* charset, /*!< in: Character set */ + const byte* start, /*!< in: start of text */ + const byte* end, /*!< in: one character past + end of text */ + fts_string_t* token, /*!< out: token's text */ + ulint* offset); /*!< out: offset to token, + measured as characters from + 'start' */ + +/*********************************************************************//** +Fetch COUNT(*) from specified table. +@return the number of rows in the table */ +UNIV_INTERN +ulint +fts_get_rows_count( +/*===============*/ + fts_table_t* fts_table); /*!< in: fts table to read */ + +/*************************************************************//** +Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists +@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */ +UNIV_INTERN +doc_id_t +fts_get_max_doc_id( +/*===============*/ + dict_table_t* table); /*!< in: user table */ + +/******************************************************************//** +Check whether user supplied stopword table exists and is of +the right format. +@return the stopword column charset if qualifies */ +UNIV_INTERN +CHARSET_INFO* +fts_valid_stopword_table( +/*=====================*/ + const char* stopword_table_name); /*!< in: Stopword table + name */ +/****************************************************************//** +This function loads specified stopword into FTS cache +@return TRUE if success */ +UNIV_INTERN +ibool +fts_load_stopword( +/*==============*/ + const dict_table_t* + table, /*!< in: Table with FTS */ + trx_t* trx, /*!< in: Transaction */ + const char* global_stopword_table, /*!< in: Global stopword table + name */ + const char* session_stopword_table, /*!< in: Session stopword table + name */ + ibool stopword_is_on, /*!< in: Whether stopword + option is turned on/off */ + ibool reload); /*!< in: Whether it is during + reload of FTS table */ + +/****************************************************************//** +Create the vector of fts_get_doc_t instances. +@return vector of fts_get_doc_t instances */ +UNIV_INTERN +ib_vector_t* +fts_get_docs_create( +/*================*/ + fts_cache_t* cache); /*!< in: fts cache */ + +/****************************************************************//** +Read the rows from the FTS index +@return DB_SUCCESS if OK */ +UNIV_INTERN +dberr_t +fts_table_fetch_doc_ids( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: aux table */ + fts_doc_ids_t* doc_ids); /*!< in: For collecting + doc ids */ +/****************************************************************//** +This function brings FTS index in sync when FTS index is first +used. There are documents that have not yet sync-ed to auxiliary +tables from last server abnormally shutdown, we will need to bring +such document into FTS cache before any further operations +@return TRUE if all OK */ +UNIV_INTERN +ibool +fts_init_index( +/*===========*/ + dict_table_t* table, /*!< in: Table with FTS */ + ibool has_cache_lock); /*!< in: Whether we already + have cache lock */ +/*******************************************************************//** +Add a newly create index in FTS cache */ +UNIV_INTERN +void +fts_add_index( +/*==========*/ + dict_index_t* index, /*!< FTS index to be added */ + dict_table_t* table); /*!< table */ + +/*******************************************************************//** +Drop auxiliary tables related to an FTS index +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +fts_drop_index( +/*===========*/ + dict_table_t* table, /*!< in: Table where indexes are dropped */ + dict_index_t* index, /*!< in: Index to be dropped */ + trx_t* trx) /*!< in: Transaction for the drop */ + __attribute__((nonnull)); + +/****************************************************************//** +Rename auxiliary tables for all fts index for a table +@return DB_SUCCESS or error code */ + +dberr_t +fts_rename_aux_tables( +/*==================*/ + dict_table_t* table, /*!< in: user Table */ + const char* new_name, /*!< in: new table name */ + trx_t* trx); /*!< in: transaction */ + +/*******************************************************************//** +Check indexes in the fts->indexes is also present in index cache and +table->indexes list +@return TRUE if all indexes match */ +UNIV_INTERN +ibool +fts_check_cached_index( +/*===================*/ + dict_table_t* table); /*!< in: Table where indexes are dropped */ +#endif /*!< fts0fts.h */ + diff --git a/storage/innobase/include/fts0opt.h b/storage/innobase/include/fts0opt.h new file mode 100644 index 00000000000..92eaf8270d2 --- /dev/null +++ b/storage/innobase/include/fts0opt.h @@ -0,0 +1,37 @@ +/***************************************************************************** + +Copyright (c) 2001, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0opt.h +Full Text Search optimize thread + +Created 2011-02-15 Jimmy Yang +***********************************************************************/ +#ifndef INNODB_FTS0OPT_H +#define INNODB_FTS0OPT_H + +/******************************************************************** +Callback function to fetch the rows in an FTS INDEX record. */ +UNIV_INTERN +ibool +fts_optimize_index_fetch_node( +/*==========================*/ + /* out: always returns non-NULL */ + void* row, /* in: sel_node_t* */ + void* user_arg); /* in: pointer to ib_vector_t */ +#endif diff --git a/storage/innobase/include/fts0pars.h b/storage/innobase/include/fts0pars.h new file mode 100644 index 00000000000..8108e811599 --- /dev/null +++ b/storage/innobase/include/fts0pars.h @@ -0,0 +1,72 @@ +/* A Bison parser, made by GNU Bison 2.5. */ + +/* Bison interface for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + FTS_OPER = 258, + FTS_TEXT = 259, + FTS_TERM = 260, + FTS_NUMB = 261 + }; +#endif + + + +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +typedef union YYSTYPE +{ + +/* Line 2068 of yacc.c */ +#line 61 "fts0pars.y" + + int oper; + fts_ast_string_t* token; + fts_ast_node_t* node; + + + +/* Line 2068 of yacc.c */ +#line 64 "fts0pars.hh" +} YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +#endif + + + + diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h new file mode 100644 index 00000000000..b4d9e1d41ec --- /dev/null +++ b/storage/innobase/include/fts0priv.h @@ -0,0 +1,653 @@ +/***************************************************************************** + +Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0priv.h +Full text search internal header file + +Created 2011/09/02 Sunny Bains +***********************************************************************/ + +#ifndef INNOBASE_FTS0PRIV_H +#define INNOBASE_FTS0PRIV_H + +#include "dict0dict.h" +#include "pars0pars.h" +#include "que0que.h" +#include "que0types.h" +#include "fts0types.h" + +/* The various states of the FTS sub system pertaining to a table with +FTS indexes defined on it. */ +enum fts_table_state_enum { + /* !<This must be 0 since we insert + a hard coded '0' at create time + to the config table */ + + FTS_TABLE_STATE_RUNNING = 0, /*!< Auxiliary tables created OK */ + + FTS_TABLE_STATE_OPTIMIZING, /*!< This is a substate of RUNNING */ + + FTS_TABLE_STATE_DELETED /*!< All aux tables to be dropped when + it's safe to do so */ +}; + +typedef enum fts_table_state_enum fts_table_state_t; + +/** The default time to wait for the background thread (in microsecnds). */ +#define FTS_MAX_BACKGROUND_THREAD_WAIT 10000 + +/** Maximum number of iterations to wait before we complain */ +#define FTS_BACKGROUND_THREAD_WAIT_COUNT 1000 + +/** The maximum length of the config table's value column in bytes */ +#define FTS_MAX_CONFIG_NAME_LEN 64 + +/** The maximum length of the config table's value column in bytes */ +#define FTS_MAX_CONFIG_VALUE_LEN 1024 + +/** Approx. upper limit of ilist length in bytes. */ +#define FTS_ILIST_MAX_SIZE (64 * 1024) + +/** FTS config table name parameters */ + +/** The number of seconds after which an OPTIMIZE run will stop */ +#define FTS_OPTIMIZE_LIMIT_IN_SECS "optimize_checkpoint_limit" + +/** The next doc id */ +#define FTS_SYNCED_DOC_ID "synced_doc_id" + +/** The last word that was OPTIMIZED */ +#define FTS_LAST_OPTIMIZED_WORD "last_optimized_word" + +/** Total number of documents that have been deleted. The next_doc_id +minus this count gives us the total number of documents. */ +#define FTS_TOTAL_DELETED_COUNT "deleted_doc_count" + +/** Total number of words parsed from all documents */ +#define FTS_TOTAL_WORD_COUNT "total_word_count" + +/** Start of optimize of an FTS index */ +#define FTS_OPTIMIZE_START_TIME "optimize_start_time" + +/** End of optimize for an FTS index */ +#define FTS_OPTIMIZE_END_TIME "optimize_end_time" + +/** User specified stopword table name */ +#define FTS_STOPWORD_TABLE_NAME "stopword_table_name" + +/** Whether to use (turn on/off) stopword */ +#define FTS_USE_STOPWORD "use_stopword" + +/** State of the FTS system for this table. It can be one of + RUNNING, OPTIMIZING, DELETED. */ +#define FTS_TABLE_STATE "table_state" + +/** The minimum length of an FTS auxiliary table names's id component +e.g., For an auxiliary table name + + FTS_<TABLE_ID>_SUFFIX + +This constant is for the minimum length required to store the <TABLE_ID> +component. +*/ +#define FTS_AUX_MIN_TABLE_ID_LENGTH 48 + +/** Maximum length of an integer stored in the config table value column. */ +#define FTS_MAX_INT_LEN 32 + +/******************************************************************//** +Parse an SQL string. %s is replaced with the table's id. +@return query graph */ +UNIV_INTERN +que_t* +fts_parse_sql( +/*==========*/ + fts_table_t* fts_table, /*!< in: FTS aux table */ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql) /*!< in: SQL string to evaluate */ + __attribute__((nonnull(3), malloc, warn_unused_result)); +/******************************************************************//** +Evaluate a parsed SQL statement +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_eval_sql( +/*=========*/ + trx_t* trx, /*!< in: transaction */ + que_t* graph) /*!< in: Parsed statement */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Construct the name of an ancillary FTS table for the given table. +@return own: table name, must be freed with mem_free() */ +UNIV_INTERN +char* +fts_get_table_name( +/*===============*/ + const fts_table_t* + fts_table) /*!< in: FTS aux table info */ + __attribute__((nonnull, malloc, warn_unused_result)); +/******************************************************************//** +Construct the column specification part of the SQL string for selecting the +indexed FTS columns for the given table. Adds the necessary bound +ids to the given 'info' and returns the SQL string. Examples: + +One indexed column named "text": + + "$sel0", + info/ids: sel0 -> "text" + +Two indexed columns named "subject" and "content": + + "$sel0, $sel1", + info/ids: sel0 -> "subject", sel1 -> "content", +@return heap-allocated WHERE string */ +UNIV_INTERN +const char* +fts_get_select_columns_str( +/*=======================*/ + dict_index_t* index, /*!< in: FTS index */ + pars_info_t* info, /*!< in/out: parser info */ + mem_heap_t* heap) /*!< in: memory heap */ + __attribute__((nonnull, warn_unused_result)); + +/** define for fts_doc_fetch_by_doc_id() "option" value, defines whether +we want to get Doc whose ID is equal to or greater or smaller than supplied +ID */ +#define FTS_FETCH_DOC_BY_ID_EQUAL 1 +#define FTS_FETCH_DOC_BY_ID_LARGE 2 +#define FTS_FETCH_DOC_BY_ID_SMALL 3 + +/*************************************************************//** +Fetch document (= a single row's indexed text) with the given +document id. +@return: DB_SUCCESS if fetch is successful, else error */ +UNIV_INTERN +dberr_t +fts_doc_fetch_by_doc_id( +/*====================*/ + fts_get_doc_t* get_doc, /*!< in: state */ + doc_id_t doc_id, /*!< in: id of document to fetch */ + dict_index_t* index_to_use, /*!< in: caller supplied FTS index, + or NULL */ + ulint option, /*!< in: search option, if it is + greater than doc_id or equal */ + fts_sql_callback + callback, /*!< in: callback to read + records */ + void* arg) /*!< in: callback arg */ + __attribute__((nonnull(6))); + +/*******************************************************************//** +Callback function for fetch that stores the text of an FTS document, +converting each column to UTF-16. +@return always FALSE */ +UNIV_INTERN +ibool +fts_query_expansion_fetch_doc( +/*==========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ + __attribute__((nonnull)); +/******************************************************************** +Write out a single word's data as new entry/entries in the INDEX table. +@return DB_SUCCESS if all OK. */ +UNIV_INTERN +dberr_t +fts_write_node( +/*===========*/ + trx_t* trx, /*!< in: transaction */ + que_t** graph, /*!< in: query graph */ + fts_table_t* fts_table, /*!< in: the FTS aux index */ + fts_string_t* word, /*!< in: word in UTF-8 */ + fts_node_t* node) /*!< in: node columns */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Tokenize a document. */ +UNIV_INTERN +void +fts_tokenize_document( +/*==================*/ + fts_doc_t* doc, /*!< in/out: document to + tokenize */ + fts_doc_t* result) /*!< out: if provided, save + result tokens here */ + __attribute__((nonnull(1))); + +/*******************************************************************//** +Continue to tokenize a document. */ +UNIV_INTERN +void +fts_tokenize_document_next( +/*=======================*/ + fts_doc_t* doc, /*!< in/out: document to + tokenize */ + ulint add_pos, /*!< in: add this position to all + tokens from this tokenization */ + fts_doc_t* result) /*!< out: if provided, save + result tokens here */ + __attribute__((nonnull(1))); +/******************************************************************//** +Initialize a document. */ +UNIV_INTERN +void +fts_doc_init( +/*=========*/ + fts_doc_t* doc) /*!< in: doc to initialize */ + __attribute__((nonnull)); + +/******************************************************************//** +Do a binary search for a doc id in the array +@return +ve index if found -ve index where it should be + inserted if not found */ +UNIV_INTERN +int +fts_bsearch( +/*========*/ + fts_update_t* array, /*!< in: array to sort */ + int lower, /*!< in: lower bound of array*/ + int upper, /*!< in: upper bound of array*/ + doc_id_t doc_id) /*!< in: doc id to lookup */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Free document. */ +UNIV_INTERN +void +fts_doc_free( +/*=========*/ + fts_doc_t* doc) /*!< in: document */ + __attribute__((nonnull)); +/******************************************************************//** +Free fts_optimizer_word_t instanace.*/ +UNIV_INTERN +void +fts_word_free( +/*==========*/ + fts_word_t* word) /*!< in: instance to free.*/ + __attribute__((nonnull)); +/******************************************************************//** +Read the rows from the FTS inde +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_index_fetch_nodes( +/*==================*/ + trx_t* trx, /*!< in: transaction */ + que_t** graph, /*!< in: prepared statement */ + fts_table_t* fts_table, /*!< in: FTS aux table */ + const fts_string_t* + word, /*!< in: the word to fetch */ + fts_fetch_t* fetch) /*!< in: fetch callback.*/ + __attribute__((nonnull)); +/******************************************************************//** +Create a fts_optimizer_word_t instance. +@return new instance */ +UNIV_INTERN +fts_word_t* +fts_word_init( +/*==========*/ + fts_word_t* word, /*!< in: word to initialize */ + byte* utf8, /*!< in: UTF-8 string */ + ulint len) /*!< in: length of string in bytes */ + __attribute__((nonnull)); +/******************************************************************//** +Compare two fts_trx_table_t instances, we actually compare the +table id's here. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_cmp( +/*==============*/ + const void* v1, /*!< in: id1 */ + const void* v2) /*!< in: id2 */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Compare a table id with a trx_table_t table id. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_id_cmp( +/*=================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Commit a transaction. +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +fts_sql_commit( +/*===========*/ + trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull)); +/******************************************************************//** +Rollback a transaction. +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +fts_sql_rollback( +/*=============*/ + trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull)); +/******************************************************************//** +Parse an SQL string. %s is replaced with the table's id. Don't acquire +the dict mutex +@return query graph */ +UNIV_INTERN +que_t* +fts_parse_sql_no_dict_lock( +/*=======================*/ + fts_table_t* fts_table, /*!< in: table with FTS index */ + pars_info_t* info, /*!< in: parser info */ + const char* sql) /*!< in: SQL string to evaluate */ + __attribute__((nonnull(3), malloc, warn_unused_result)); +/******************************************************************//** +Get value from config table. The caller must ensure that enough +space is allocated for value to hold the column contents +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_get_value( +/*=================*/ + trx_t* trx, /* transaction */ + fts_table_t* fts_table, /*!< in: the indexed FTS table */ + const char* name, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ + __attribute__((nonnull)); +/******************************************************************//** +Get value specific to an FTS index from the config table. The caller +must ensure that enough space is allocated for value to hold the +column contents. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_get_index_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: index */ + const char* param, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Set the value in the config table for name. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_set_value( +/*=================*/ + trx_t* trx, /*!< transaction */ + fts_table_t* fts_table, /*!< in: the indexed FTS table */ + const char* name, /*!< in: get config value for + this parameter name */ + const fts_string_t* + value) /*!< in: value to update */ + __attribute__((nonnull)); +/****************************************************************//** +Set an ulint value in the config table. +@return DB_SUCCESS if all OK else error code */ +UNIV_INTERN +dberr_t +fts_config_set_ulint( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: the indexed FTS table */ + const char* name, /*!< in: param name */ + ulint int_value) /*!< in: value */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Set the value specific to an FTS index in the config table. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_set_index_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: index */ + const char* param, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Increment the value in the config table for column name. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_increment_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + fts_table_t* fts_table, /*!< in: the indexed FTS table */ + const char* name, /*!< in: increment config value + for this parameter name */ + ulint delta) /*!< in: increment by this much */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Increment the per index value in the config table for column name. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_increment_index_value( +/*=============================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: FTS index */ + const char* name, /*!< in: increment config value + for this parameter name */ + ulint delta) /*!< in: increment by this much */ + __attribute__((nonnull)); +/******************************************************************//** +Get an ulint value from the config table. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_get_index_ulint( +/*=======================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + const char* name, /*!< in: param name */ + ulint* int_value) /*!< out: value */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Set an ulint value int the config table. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_set_index_ulint( +/*=======================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + const char* name, /*!< in: param name */ + ulint int_value) /*!< in: value */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Get an ulint value from the config table. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_config_get_ulint( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: the indexed FTS table */ + const char* name, /*!< in: param name */ + ulint* int_value) /*!< out: value */ + __attribute__((nonnull)); +/******************************************************************//** +Search cache for word. +@return the word node vector if found else NULL */ +UNIV_INTERN +const ib_vector_t* +fts_cache_find_word( +/*================*/ + const fts_index_cache_t* + index_cache, /*!< in: cache to search */ + const fts_string_t* + text) /*!< in: word to search for */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Check cache for deleted doc id. +@return TRUE if deleted */ +UNIV_INTERN +ibool +fts_cache_is_deleted_doc_id( +/*========================*/ + const fts_cache_t* + cache, /*!< in: cache ito search */ + doc_id_t doc_id) /*!< in: doc id to search for */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Append deleted doc ids to vector and sort the vector. */ +UNIV_INTERN +void +fts_cache_append_deleted_doc_ids( +/*=============================*/ + const fts_cache_t* + cache, /*!< in: cache to use */ + ib_vector_t* vector); /*!< in: append to this vector */ +/******************************************************************//** +Wait for the background thread to start. We poll to detect change +of state, which is acceptable, since the wait should happen only +once during startup. +@return true if the thread started else FALSE (i.e timed out) */ +UNIV_INTERN +ibool +fts_wait_for_background_thread_to_start( +/*====================================*/ + dict_table_t* table, /*!< in: table to which the thread + is attached */ + ulint max_wait); /*!< in: time in microseconds, if set + to 0 then it disables timeout + checking */ +#ifdef FTS_DOC_STATS_DEBUG +/******************************************************************//** +Get the total number of words in the FTS for a particular FTS index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fts_get_total_word_count( +/*=====================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: for this index */ + ulint* total) /*!< out: total words */ + __attribute__((nonnull, warn_unused_result)); +#endif +/******************************************************************//** +Search the index specific cache for a particular FTS index. +@return the index specific cache else NULL */ +UNIV_INTERN +fts_index_cache_t* +fts_find_index_cache( +/*================*/ + const fts_cache_t* + cache, /*!< in: cache to search */ + const dict_index_t* + index) /*!< in: index to search for */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Write the table id to the given buffer (including final NUL). Buffer must be +at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long. +@return number of bytes written */ +UNIV_INLINE +int +fts_write_object_id( +/*================*/ + ib_id_t id, /*!< in: a table/index id */ + char* str, /*!< in: buffer to write the id to */ + bool hex_format __attribute__((unused))) + /*!< in: true for fixed hex format, + false for old ambiguous format */ + __attribute__((nonnull)); +/******************************************************************//** +Read the table id from the string generated by fts_write_object_id(). +@return TRUE if parse successful */ +UNIV_INLINE +ibool +fts_read_object_id( +/*===============*/ + ib_id_t* id, /*!< out: a table id */ + const char* str) /*!< in: buffer to read from */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Get the table id. +@return number of bytes written */ +UNIV_INTERN +int +fts_get_table_id( +/*=============*/ + const fts_table_t* + fts_table, /*!< in: FTS Auxiliary table */ + char* table_id) /*!< out: table id, must be at least + FTS_AUX_MIN_TABLE_ID_LENGTH bytes + long */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Add the table to add to the OPTIMIZER's list. */ +UNIV_INTERN +void +fts_optimize_add_table( +/*===================*/ + dict_table_t* table) /*!< in: table to add */ + __attribute__((nonnull)); +/******************************************************************//** +Optimize a table. */ +UNIV_INTERN +void +fts_optimize_do_table( +/*==================*/ + dict_table_t* table) /*!< in: table to optimize */ + __attribute__((nonnull)); +/******************************************************************//** +Construct the prefix name of an FTS table. +@return own: table name, must be freed with mem_free() */ +UNIV_INTERN +char* +fts_get_table_name_prefix( +/*======================*/ + const fts_table_t* + fts_table) /*!< in: Auxiliary table type */ + __attribute__((nonnull, malloc, warn_unused_result)); +/******************************************************************//** +Add node positions. */ +UNIV_INTERN +void +fts_cache_node_add_positions( +/*=========================*/ + fts_cache_t* cache, /*!< in: cache */ + fts_node_t* node, /*!< in: word node */ + doc_id_t doc_id, /*!< in: doc id */ + ib_vector_t* positions) /*!< in: fts_token_t::positions */ + __attribute__((nonnull(2,4))); + +/******************************************************************//** +Create the config table name for retrieving index specific value. +@return index config parameter name */ +UNIV_INTERN +char* +fts_config_create_index_param_name( +/*===============================*/ + const char* param, /*!< in: base name of param */ + const dict_index_t* index) /*!< in: index for config */ + __attribute__((nonnull, malloc, warn_unused_result)); + +#ifndef UNIV_NONINL +#include "fts0priv.ic" +#endif + +#endif /* INNOBASE_FTS0PRIV_H */ diff --git a/storage/innobase/include/fts0priv.ic b/storage/innobase/include/fts0priv.ic new file mode 100644 index 00000000000..2d07c60f980 --- /dev/null +++ b/storage/innobase/include/fts0priv.ic @@ -0,0 +1,130 @@ +/***************************************************************************** + +Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0priv.ic +Full text search internal header file + +Created 2011/11/12 Sunny Bains +***********************************************************************/ + +/******************************************************************//** +Write the table id to the given buffer (including final NUL). Buffer must be +at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long. +@return number of bytes written */ +UNIV_INLINE +int +fts_write_object_id( +/*================*/ + ib_id_t id, /* in: a table/index id */ + char* str, /* in: buffer to write the id to */ + bool hex_format __attribute__((unused))) + /* in: true for fixed hex format, + false for old ambiguous format */ +{ + +#ifdef _WIN32 + + DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name", + return(sprintf(str, UINT64PFx, id));); + + /* Use this to construct old(5.6.14 and 5.7.3) windows + ambiguous aux table names */ + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + return(sprintf(str, "%016llu", id));); + +#else /* _WIN32 */ + + /* Use this to construct old(5.6.14 and 5.7.3) windows + ambiguous aux table names */ + DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name", + return(sprintf(str, "%016"PRIu64, id));); + + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + return(sprintf(str, UINT64PFx, id));); + +#endif /* _WIN32 */ + + /* As above, but this is only for those tables failing to rename. */ + if (!hex_format) { +#ifdef _WIN32 + // FIXME: Use ut_snprintf(), so does following one. + return(sprintf(str, "%016llu", id)); +#else /* _WIN32 */ + return(sprintf(str, "%016"PRIu64, id)); +#endif /* _WIN32 */ + } + + return(sprintf(str, UINT64PFx, id)); +} + +/******************************************************************//** +Read the table id from the string generated by fts_write_object_id(). +@return TRUE if parse successful */ +UNIV_INLINE +ibool +fts_read_object_id( +/*===============*/ + ib_id_t* id, /* out: an id */ + const char* str) /* in: buffer to read from */ +{ + /* NOTE: this func doesn't care about whether current table + is set with HEX_NAME, the user of the id read here will check + if the id is HEX or DEC and do the right thing with it. */ + return(sscanf(str, UINT64PFx, id) == 1); +} + +/******************************************************************//** +Compare two fts_trx_table_t instances. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_cmp( +/*==============*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const dict_table_t* table1 = (*(const fts_trx_table_t**) p1)->table; + const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table; + + return((table1->id > table2->id) + ? 1 + : (table1->id == table2->id) + ? 0 + : -1); +} + +/******************************************************************//** +Compare a table id with a fts_trx_table_t table id. +@return < 0 if n1 < n2, 0 if n1 == n2,> 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_id_cmp( +/*=================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const ullint* table_id = (const ullint*) p1; + const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table; + + return((*table_id > table2->id) + ? 1 + : (*table_id == table2->id) + ? 0 + : -1); +} diff --git a/storage/innobase/include/fts0tlex.h b/storage/innobase/include/fts0tlex.h new file mode 100644 index 00000000000..f91533803e8 --- /dev/null +++ b/storage/innobase/include/fts0tlex.h @@ -0,0 +1,349 @@ +#ifndef fts0tHEADER_H +#define fts0tHEADER_H 1 +#define fts0tIN_HEADER 1 + +#line 6 "../include/fts0tlex.h" + +#line 8 "../include/fts0tlex.h" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 35 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void fts0trestart (FILE *input_file ,yyscan_t yyscanner ); +void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0t_create_buffer (FILE *file,int size ,yyscan_t yyscanner ); +void fts0t_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0t_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); +void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); +void fts0tpop_buffer_state (yyscan_t yyscanner ); + +YY_BUFFER_STATE fts0t_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0t_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); +YY_BUFFER_STATE fts0t_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner ); + +void *fts0talloc (yy_size_t ,yyscan_t yyscanner ); +void *fts0trealloc (void *,yy_size_t ,yyscan_t yyscanner ); +void fts0tfree (void * ,yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define fts0twrap(n) 1 +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 + +#endif + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +int fts0tlex_init (yyscan_t* scanner); + +int fts0tlex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int fts0tlex_destroy (yyscan_t yyscanner ); + +int fts0tget_debug (yyscan_t yyscanner ); + +void fts0tset_debug (int debug_flag ,yyscan_t yyscanner ); + +YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner ); + +void fts0tset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner ); + +FILE *fts0tget_in (yyscan_t yyscanner ); + +void fts0tset_in (FILE * in_str ,yyscan_t yyscanner ); + +FILE *fts0tget_out (yyscan_t yyscanner ); + +void fts0tset_out (FILE * out_str ,yyscan_t yyscanner ); + +int fts0tget_leng (yyscan_t yyscanner ); + +char *fts0tget_text (yyscan_t yyscanner ); + +int fts0tget_lineno (yyscan_t yyscanner ); + +void fts0tset_lineno (int line_number ,yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int fts0twrap (yyscan_t yyscanner ); +#else +extern int fts0twrap (yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int fts0tlex (yyscan_t yyscanner); + +#define YY_DECL int fts0tlex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif + +#line 68 "fts0tlex.l" + + +#line 348 "../include/fts0tlex.h" +#undef fts0tIN_HEADER +#endif /* fts0tHEADER_H */ diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h new file mode 100644 index 00000000000..64677428331 --- /dev/null +++ b/storage/innobase/include/fts0types.h @@ -0,0 +1,474 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0types.h +Full text search types file + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FTS0TYPES_H +#define INNOBASE_FTS0TYPES_H + +#include "que0types.h" +#include "ut0byte.h" +#include "fut0fut.h" +#include "ut0rbt.h" +#include "fts0fts.h" + +/** Types used within FTS. */ +struct fts_que_t; +struct fts_node_t; +struct fts_utf8_str_t; + +/** Callbacks used within FTS. */ +typedef pars_user_func_cb_t fts_sql_callback; +typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len); + +/** Statistics relevant to a particular document, used during retrieval. */ +struct fts_doc_stats_t { + doc_id_t doc_id; /*!< Document id */ + ulint word_count; /*!< Total words in the document */ +}; + +/** It's main purpose is to store the SQL prepared statements that +are required to retrieve a document from the database. */ +struct fts_get_doc_t { + fts_index_cache_t* + index_cache; /*!< The index cache instance */ + + /*!< Parsed sql statement */ + que_t* get_document_graph; + fts_cache_t* cache; /*!< The parent cache */ +}; + +/** Since we can have multiple FTS indexes on a table, we keep a +per index cache of words etc. */ +struct fts_index_cache_t { + dict_index_t* index; /*!< The FTS index instance */ + + ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*, + cells are fts_tokenizer_word_t*.*/ + + ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t + contained in the memory buffer. + Must be in sorted order (ascending). + The ideal choice is an rb tree but + the rb tree imposes a space overhead + that we can do without */ + + que_t** ins_graph; /*!< Insert query graphs */ + + que_t** sel_graph; /*!< Select query graphs */ + CHARSET_INFO* charset; /*!< charset */ +}; + +/** For supporting the tracking of updates on multiple FTS indexes we need +to track which FTS indexes need to be updated. For INSERT and DELETE we +update all fts indexes. */ +struct fts_update_t { + doc_id_t doc_id; /*!< The doc id affected */ + + ib_vector_t* fts_indexes; /*!< The FTS indexes that need to be + updated. A NULL value means all + indexes need to be updated. This + vector is not allocated on the heap + and so must be freed explicitly, + when we are done with it */ +}; + +/** Stop word control infotmation. */ +struct fts_stopword_t { + ulint status; /*!< Status of the stopword tree */ + ib_alloc_t* heap; /*!< The memory allocator to use */ + ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */ + CHARSET_INFO* charset; /*!< charset for stopword */ +}; + +/** The SYNC state of the cache. There is one instance of this struct +associated with each ADD thread. */ +struct fts_sync_t { + trx_t* trx; /*!< The transaction used for SYNCing + the cache to disk */ + dict_table_t* table; /*!< Table with FTS index(es) */ + ulint max_cache_size; /*!< Max size in bytes of the cache */ + ibool cache_full; /*!< flag, when true it indicates that + we need to sync the cache to disk */ + ulint lower_index; /*!< the start index of the doc id + vector from where to start adding + documents to the FTS cache */ + ulint upper_index; /*!< max index of the doc id vector to + add to the FTS cache */ + ibool interrupted; /*!< TRUE if SYNC was interrupted */ + doc_id_t min_doc_id; /*!< The smallest doc id added to the + cache. It should equal to + doc_ids[lower_index] */ + doc_id_t max_doc_id; /*!< The doc id at which the cache was + noted as being full, we use this to + set the upper_limit field */ + ib_time_t start_time; /*!< SYNC start time */ +}; + +/** The cache for the FTS system. It is a memory-based inverted index +that new entries are added to, until it grows over the configured maximum +size, at which time its contents are written to the INDEX table. */ +struct fts_cache_t { + rw_lock_t lock; /*!< lock protecting all access to the + memory buffer. FIXME: this needs to + be our new upgrade-capable rw-lock */ + + rw_lock_t init_lock; /*!< lock used for the cache + intialization, it has different + SYNC level as above cache lock */ + + ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */ + + ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */ + + ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */ + + ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each + element is of type fts_update_t */ + + ib_vector_t* indexes; /*!< We store the stats and inverted + index for the individual FTS indexes + in this vector. Each element is + an instance of fts_index_cache_t */ + + ib_vector_t* get_docs; /*!< information required to read + the document from the table. Each + element is of type fts_doc_t */ + + ulint total_size; /*!< total size consumed by the ilist + field of all nodes. SYNC is run + whenever this gets too big */ + fts_sync_t* sync; /*!< sync structure to sync data to + disk */ + ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes + and deleted_doc_ids, ie. transient + objects, they are recreated after + a SYNC is completed */ + + + ib_alloc_t* self_heap; /*!< This heap is the heap out of + which an instance of the cache itself + was created. Objects created using + this heap will last for the lifetime + of the cache */ + + doc_id_t next_doc_id; /*!< Next doc id */ + + doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */ + + doc_id_t first_doc_id; /*!< first doc id since this table + was opened */ + + ulint deleted; /*!< Number of doc ids deleted since + last optimized. This variable is + covered by deleted_lock */ + + ulint added; /*!< Number of doc ids added since last + optimized. This variable is covered by + the deleted lock */ + + fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */ + mem_heap_t* cache_heap; /*!< Cache Heap */ +}; + +/** Columns of the FTS auxiliary INDEX table */ +struct fts_node_t { + doc_id_t first_doc_id; /*!< First document id in ilist. */ + + doc_id_t last_doc_id; /*!< Last document id in ilist. */ + + byte* ilist; /*!< Binary list of documents & word + positions the token appears in. + TODO: For now, these are simply + ut_malloc'd, but if testing shows + that they waste memory unacceptably, a + special memory allocator will have + to be written */ + + ulint doc_count; /*!< Number of doc ids in ilist */ + + ulint ilist_size; /*!< Used size of ilist in bytes. */ + + ulint ilist_size_alloc; + /*!< Allocated size of ilist in + bytes */ +}; + +/** A tokenizer word. Contains information about one word. */ +struct fts_tokenizer_word_t { + fts_string_t text; /*!< Token text. */ + + ib_vector_t* nodes; /*!< Word node ilists, each element is + of type fts_node_t */ +}; + +/** Word text plus it's array of nodes as on disk in FTS index */ +struct fts_word_t { + fts_string_t text; /*!< Word value in UTF-8 */ + ib_vector_t* nodes; /*!< Nodes read from disk */ + + ib_alloc_t* heap_alloc; /*!< For handling all allocations */ +}; + +/** Callback for reading and filtering nodes that are read from FTS index */ +struct fts_fetch_t { + void* read_arg; /*!< Arg for the sql_callback */ + + fts_sql_callback + read_record; /*!< Callback for reading index + record */ + ulint total_memory; /*!< Total memory used */ +}; + +/** For horizontally splitting an FTS auxiliary index */ +struct fts_index_selector_t { + ulint value; /*!< Character value at which + to split */ + + const char* suffix; /*!< FTS aux index suffix */ +}; + +/** This type represents a single document. */ +struct fts_doc_t { + fts_string_t text; /*!< document text */ + + ibool found; /*!< TRUE if the document was found + successfully in the database */ + + ib_rbt_t* tokens; /*!< This is filled when the document + is tokenized. Tokens; indexed by + fts_string_t*, cells are of type + fts_token_t* */ + + ib_alloc_t* self_heap; /*!< An instance of this type is + allocated from this heap along + with any objects that have the + same lifespan, most notably + the vector of token positions */ + CHARSET_INFO* charset; /*!< Document's charset info */ +}; + +/** A token and its positions within a document. */ +struct fts_token_t { + fts_string_t text; /*!< token text */ + + ib_vector_t* positions; /*!< an array of the positions the + token is found in; each item is + actually an ulint. */ +}; + +/** It's defined in fts/fts0fts.c */ +extern const fts_index_selector_t fts_index_selector[]; + +/******************************************************************//** +Compare two UTF-8 strings. */ +UNIV_INLINE +int +fts_utf8_string_cmp( +/*================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/******************************************************************//** +Compare two UTF-8 strings, and return match (0) if +passed in "key" value equals or is the prefix of the "node" value. */ +UNIV_INLINE +int +fts_utf8_string_cmp_prefix( +/*=======================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/******************************************************************//** +Compare two fts_trx_row_t instances doc_ids. */ +UNIV_INLINE +int +fts_trx_row_doc_id_cmp( +/*===================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: id1 */ + const void* p2); /*!< in: id2 */ + +/******************************************************************//** +Compare two fts_ranking_t instances doc_ids. */ +UNIV_INLINE +int +fts_ranking_doc_id_cmp( +/*===================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: id1 */ + const void* p2); /*!< in: id2 */ + +/******************************************************************//** +Compare two fts_update_t instances doc_ids. */ +UNIV_INLINE +int +fts_update_doc_id_cmp( +/*==================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + const void* p1, /*!< in: id1 */ + const void* p2); /*!< in: id2 */ + +/******************************************************************//** +Decode and return the integer that was encoded using our VLC scheme.*/ +UNIV_INLINE +ulint +fts_decode_vlc( +/*===========*/ + /*!< out: value decoded */ + byte** ptr); /*!< in: ptr to decode from, this ptr is + incremented by the number of bytes decoded */ + +/******************************************************************//** +Duplicate an UTF-8 string. */ +UNIV_INLINE +void +fts_utf8_string_dup( +/*================*/ + /*!< out: + < 0 if n1 < n2, + 0 if n1 == n2, + > 0 if n1 > n2 */ + fts_string_t* dst, /*!< in: dup to here */ + const fts_string_t* src, /*!< in: src string */ + mem_heap_t* heap); /*!< in: heap to use */ + +/******************************************************************//** +Return length of val if it were encoded using our VLC scheme. */ +UNIV_INLINE +ulint +fts_get_encoded_len( +/*================*/ + /*!< out: length of value + encoded, in bytes */ + ulint val); /*!< in: value to encode */ + +/******************************************************************//** +Encode an integer using our VLC scheme and return the length in bytes. */ +UNIV_INLINE +ulint +fts_encode_int( +/*===========*/ + /*!< out: length of value + encoded, in bytes */ + ulint val, /*!< in: value to encode */ + byte* buf); /*!< in: buffer, must have + enough space */ + +/******************************************************************//** +Decode a UTF-8 character. + +http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf: + + Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte +00000000 0xxxxxxx 0xxxxxxx +00000yyy yyxxxxxx 110yyyyy 10xxxxxx +zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx +000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx + +This function decodes UTF-8 sequences up to 6 bytes (31 bits). + +On error *ptr will point to the first byte that was not correctly +decoded. This will hopefully help in resyncing the input. */ +UNIV_INLINE +ulint +fts_utf8_decode( +/*============*/ + /*!< out: UTF8_ERROR if *ptr + did not point to a valid + UTF-8 sequence, or the + Unicode code point. */ + const byte** ptr); /*!< in/out: pointer to + UTF-8 string. The + pointer is advanced to + the start of the next + character. */ + +/******************************************************************//** +Lowercase an UTF-8 string. */ +UNIV_INLINE +void +fts_utf8_tolower( +/*=============*/ + fts_string_t* str); /*!< in: string */ + +/******************************************************************//** +Get the selected FTS aux INDEX suffix. */ +UNIV_INLINE +const char* +fts_get_suffix( +/*===========*/ + ulint selected); /*!< in: selected index */ + +/******************************************************************** +Get the number of index selectors. */ +UNIV_INLINE +ulint +fts_get_n_selectors(void); +/*=====================*/ + +/******************************************************************//** +Select the FTS auxiliary index for the given string. +@return the index to use for the string */ +UNIV_INLINE +ulint +fts_select_index( +/*=============*/ + const CHARSET_INFO* cs, /*!< Charset */ + const byte* str, /*!< in: word string */ + ulint len); /*!< in: string length */ + +/******************************************************************** +Select the next FTS auxiliary index for the given character. +@return the next index to use for character */ +UNIV_INLINE +ulint +fts_select_next_index( +/*==================*/ + const CHARSET_INFO* cs, /*!< Charset */ + const byte* str, /*!< in: string */ + ulint len); /*!< in: string length */ + +#ifndef UNIV_NONINL +#include "fts0types.ic" +#include "fts0vlc.ic" +#endif + +#endif /* INNOBASE_FTS0TYPES_H */ diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic new file mode 100644 index 00000000000..f0dfd023a70 --- /dev/null +++ b/storage/innobase/include/fts0types.ic @@ -0,0 +1,388 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0types.ic +Full text search types. + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FTS0TYPES_IC +#define INNOBASE_FTS0TYPES_IC + +#include <ctype.h> + +#include "rem0cmp.h" +#include "ha_prototypes.h" + +extern const ulint UTF8_ERROR; + +/* Determine if a UTF-8 continuation byte is valid. */ +#define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80) + +/******************************************************************//** +Duplicate an UTF-8 string. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +void +fts_utf8_string_dup( +/*================*/ + fts_string_t* dst, /*!< in: dup to here */ + const fts_string_t* src, /*!< in: src string */ + mem_heap_t* heap) /*!< in: heap to use */ +{ + dst->f_str = (byte*)mem_heap_alloc(heap, src->f_len + 1); + memcpy(dst->f_str, src->f_str, src->f_len); + + dst->f_len = src->f_len; + dst->f_str[src->f_len] = 0; + dst->f_n_char = src->f_n_char; +} + +/******************************************************************//** +Compare two fts_trx_row_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_row_doc_id_cmp( +/*===================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1; + const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2; + + return((int)(tr1->doc_id - tr2->doc_id)); +} + +/******************************************************************//** +Compare two fts_ranking_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_ranking_doc_id_cmp( +/*===================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_ranking_t* rk1 = (const fts_ranking_t*) p1; + const fts_ranking_t* rk2 = (const fts_ranking_t*) p2; + + return((int)(rk1->doc_id - rk2->doc_id)); +} + +/******************************************************************//** +Compare two fts_update_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_update_doc_id_cmp( +/*==================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_update_t* up1 = (const fts_update_t*) p1; + const fts_update_t* up2 = (const fts_update_t*) p2; + + return((int)(up1->doc_id - up2->doc_id)); +} + + +/******************************************************************//** +Lowercase an UTF-8 string. */ +UNIV_INLINE +void +fts_utf8_tolower( +/*=============*/ + fts_string_t* str) /*!< in: string */ +{ + innobase_casedn_str((char*) str->f_str); +} + +/******************************************************************//** +Compare two UTF-8 strings. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_utf8_string_cmp( +/*================*/ + const void* p1, /*!< in: key */ + const void* p2) /*!< in: node */ +{ + const fts_string_t* s1 = (const fts_string_t*) p1; + const fts_string_t* s2 = (const fts_string_t*) p2; + + return(cmp_data_data_slow_varchar( + s1->f_str, s1->f_len, s2->f_str, s2->f_len)); +} + +/******************************************************************//** +Compare two UTF-8 strings, and return match (0) if +passed in "key" value equals or is the prefix of the "node" value. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_utf8_string_cmp_prefix( +/*=======================*/ + const void* p1, /*!< in: key */ + const void* p2) /*!< in: node */ +{ + int result; + ulint len; + + const fts_string_t* s1 = (const fts_string_t*) p1; + const fts_string_t* s2 = (const fts_string_t*) p2; + + len = ut_min(s1->f_len, s2->f_len); + + result = cmp_data_data_slow_varchar(s1->f_str, len, s2->f_str, len); + + if (result) { + return(result); + } + + if (s1->f_len > s2->f_len) { + return(1); + } + + return(0); +} + +/******************************************************************//** +Decode a UTF-8 character. + +http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf: + + Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte +00000000 0xxxxxxx 0xxxxxxx +00000yyy yyxxxxxx 110yyyyy 10xxxxxx +zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx +000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx + +This function decodes UTF-8 sequences up to 6 bytes (31 bits). + +On error *ptr will point to the first byte that was not correctly +decoded. This will hopefully help in resyncing the input. +@return UTF8_ERROR if *ptr did not point to a valid +UTF-8 sequence, or the Unicode code point. */ +UNIV_INLINE +ulint +fts_utf8_decode( +/*============*/ + const byte** ptr) /*!< in/out: pointer to + UTF-8 string. The + pointer is advanced to + the start of the next + character. */ +{ + const byte* p = *ptr; + ulint ch = *p++; +#ifdef UNIV_DEBUG + ulint min_ch; +#endif /* UNIV_DEBUG */ + + if (UNIV_LIKELY(ch < 0x80)) { + /* 0xxxxxxx */ + } else if (UNIV_UNLIKELY(ch < 0xC0)) { + /* A continuation byte cannot start a code. */ + goto err_exit; + } else if (ch < 0xE0) { + /* 110yyyyy 10xxxxxx */ + ch &= 0x1F; + ut_d(min_ch = 0x80); + goto get1; + } else if (ch < 0xF0) { + /* 1110zzzz 10yyyyyy 10xxxxxx */ + ch &= 0x0F; + ut_d(min_ch = 0x800); + goto get2; + } else if (ch < 0xF8) { + /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */ + ch &= 0x07; + ut_d(min_ch = 0x10000); + goto get3; + } else if (ch < 0xFC) { + /* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */ + ch &= 0x03; + ut_d(min_ch = 0x200000); + goto get4; + } else if (ch < 0xFE) { + /* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */ + ut_d(min_ch = 0x4000000); + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get4: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get3: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get2: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get1: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; + + /* The following is needed in the 6-byte case + when ulint is wider than 32 bits. */ + ch &= 0xFFFFFFFF; + + /* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs) + and U+FFFE and U+FFFF cannot occur in valid UTF-8. */ + + if ( (ch >= 0xD800 && ch <= 0xDFFF) +#ifdef UNIV_DEBUG + || ch < min_ch +#endif /* UNIV_DEBUG */ + || ch == 0xFFFE || ch == 0xFFFF) { + + ch = UTF8_ERROR; + } + } else { +err_exit: + ch = UTF8_ERROR; + } + + *ptr = p; + + return(ch); +} + +/******************************************************************//** +Get the first character's code position for FTS index partition */ +extern +ulint +innobase_strnxfrm( +/*==============*/ + const CHARSET_INFO* cs, /*!< in: Character set */ + const uchar* p2, /*!< in: string */ + const ulint len2); /*!< in: string length */ + +/******************************************************************//** +Select the FTS auxiliary index for the given character. +@return the index to use for the string */ +UNIV_INLINE +ulint +fts_select_index( +/*=============*/ + const CHARSET_INFO* cs, /*!< in: Charset */ + const byte* str, /*!< in: string */ + ulint len) /*!< in: string length */ +{ + ulint selected = 0; + ulint value = innobase_strnxfrm(cs, str, len); + + while (fts_index_selector[selected].value != 0) { + + if (fts_index_selector[selected].value == value) { + + return(selected); + + } else if (fts_index_selector[selected].value > value) { + + return(selected > 0 ? selected - 1 : 0); + } + + ++selected; + } + + ut_ad(selected > 1); + + return(selected - 1); +} + +/******************************************************************//** +Select the next FTS auxiliary index for the given character. +@return the next index to use for character */ +UNIV_INLINE +ulint +fts_select_next_index( +/*==================*/ + const CHARSET_INFO* cs, /*!< in: Charset */ + const byte* str, /*!< in: string */ + ulint len) /*!< in: string length */ +{ + ulint selected = 0; + ulint value = innobase_strnxfrm(cs, str, len); + + while (fts_index_selector[selected].value != 0) { + + if (fts_index_selector[selected].value == value) { + + return(selected + 1); + + } else if (fts_index_selector[selected].value > value) { + + return(selected); + } + + ++selected; + } + + ut_ad(selected > 0); + + return((ulint) selected); +} + +/******************************************************************//** +Return the selected FTS aux index suffix. */ +UNIV_INLINE +const char* +fts_get_suffix( +/*===========*/ + ulint selected) /*!< in: selected index */ +{ + return(fts_index_selector[selected].suffix); +} + +/******************************************************************//** +Get the number of index selectors. +@return The number of selectors */ +UNIV_INLINE +ulint +fts_get_n_selectors(void) +/*=====================*/ +{ + ulint i = 0; + + // FIXME: This is a hack + while (fts_index_selector[i].value != 0) { + ++i; + } + + return(i); +} + +#endif /* INNOBASE_FTS0TYPES_IC */ diff --git a/storage/innobase/include/fts0vlc.ic b/storage/innobase/include/fts0vlc.ic new file mode 100644 index 00000000000..e79bcf59347 --- /dev/null +++ b/storage/innobase/include/fts0vlc.ic @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0vlc.ic +Full text variable length integer encoding/decoding. + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FTS0VLC_IC +#define INNOBASE_FTS0VLC_IC + +#include "fts0types.h" + +/******************************************************************//** +Return length of val if it were encoded using our VLC scheme. +FIXME: We will need to be able encode 8 bytes value +@return length of value encoded, in bytes */ +UNIV_INLINE +ulint +fts_get_encoded_len( +/*================*/ + ulint val) /* in: value to encode */ +{ + if (val <= 127) { + return(1); + } else if (val <= 16383) { + return(2); + } else if (val <= 2097151) { + return(3); + } else if (val <= 268435455) { + return(4); + } else { + /* Possibly we should care that on 64-bit machines ulint can + contain values that we can't encode in 5 bytes, but + fts_encode_int doesn't handle them either so it doesn't much + matter. */ + + return(5); + } +} + +/******************************************************************//** +Encode an integer using our VLC scheme and return the length in bytes. +@return length of value encoded, in bytes */ +UNIV_INLINE +ulint +fts_encode_int( +/*===========*/ + ulint val, /* in: value to encode */ + byte* buf) /* in: buffer, must have enough space */ +{ + ulint len; + + if (val <= 127) { + *buf = (byte) val; + + len = 1; + } else if (val <= 16383) { + *buf++ = (byte)(val >> 7); + *buf = (byte)(val & 0x7F); + + len = 2; + } else if (val <= 2097151) { + *buf++ = (byte)(val >> 14); + *buf++ = (byte)((val >> 7) & 0x7F); + *buf = (byte)(val & 0x7F); + + len = 3; + } else if (val <= 268435455) { + *buf++ = (byte)(val >> 21); + *buf++ = (byte)((val >> 14) & 0x7F); + *buf++ = (byte)((val >> 7) & 0x7F); + *buf = (byte)(val & 0x7F); + + len = 4; + } else { + /* Best to keep the limitations of the 32/64 bit versions + identical, at least for the time being. */ + ut_ad(val <= 4294967295u); + + *buf++ = (byte)(val >> 28); + *buf++ = (byte)((val >> 21) & 0x7F); + *buf++ = (byte)((val >> 14) & 0x7F); + *buf++ = (byte)((val >> 7) & 0x7F); + *buf = (byte)(val & 0x7F); + + len = 5; + } + + /* High-bit on means "last byte in the encoded integer". */ + *buf |= 0x80; + + return(len); +} + +/******************************************************************//** +Decode and return the integer that was encoded using our VLC scheme. +@return value decoded */ +UNIV_INLINE +ulint +fts_decode_vlc( +/*===========*/ + byte** ptr) /* in: ptr to decode from, this ptr is + incremented by the number of bytes decoded */ +{ + ulint val = 0; + + for (;;) { + byte b = **ptr; + + ++*ptr; + val |= (b & 0x7F); + + /* High-bit on means "last byte in the encoded integer". */ + if (b & 0x80) { + break; + } else { + val <<= 7; + } + } + + return(val); +} + +#endif diff --git a/storage/innobase/include/fut0fut.h b/storage/innobase/include/fut0fut.h new file mode 100644 index 00000000000..851cdb44cdf --- /dev/null +++ b/storage/innobase/include/fut0fut.h @@ -0,0 +1,55 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fut0fut.h +File-based utilities + +Created 12/13/1995 Heikki Tuuri +***********************************************************************/ + + +#ifndef fut0fut_h +#define fut0fut_h + +#include "univ.i" + +#include "fil0fil.h" +#include "mtr0mtr.h" + +/********************************************************************//** +Gets a pointer to a file address and latches the page. +@return pointer to a byte in a frame; the file page in the frame is +bufferfixed and latched */ +UNIV_INLINE +byte* +fut_get_ptr( +/*========*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + fil_addr_t addr, /*!< in: file address */ + ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */ + mtr_t* mtr); /*!< in: mtr handle */ + +#ifndef UNIV_NONINL +#include "fut0fut.ic" +#endif + +#endif + diff --git a/storage/innobase/include/fut0fut.ic b/storage/innobase/include/fut0fut.ic new file mode 100644 index 00000000000..b065b10b9ca --- /dev/null +++ b/storage/innobase/include/fut0fut.ic @@ -0,0 +1,56 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fut0fut.ic +File-based utilities + +Created 12/13/1995 Heikki Tuuri +***********************************************************************/ + +#include "sync0rw.h" +#include "buf0buf.h" + +/********************************************************************//** +Gets a pointer to a file address and latches the page. +@return pointer to a byte in a frame; the file page in the frame is +bufferfixed and latched */ +UNIV_INLINE +byte* +fut_get_ptr( +/*========*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + fil_addr_t addr, /*!< in: file address */ + ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */ + mtr_t* mtr) /*!< in: mtr handle */ +{ + buf_block_t* block; + byte* ptr; + + ut_ad(addr.boffset < UNIV_PAGE_SIZE); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + block = buf_page_get(space, zip_size, addr.page, rw_latch, mtr); + ptr = buf_block_get_frame(block) + addr.boffset; + + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + return(ptr); +} diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h new file mode 100644 index 00000000000..90f9a65d4fa --- /dev/null +++ b/storage/innobase/include/fut0lst.h @@ -0,0 +1,217 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fut0lst.h +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef fut0lst_h +#define fut0lst_h + +#include "univ.i" + +#include "fil0fil.h" +#include "mtr0mtr.h" + + +/* The C 'types' of base node and list node: these should be used to +write self-documenting code. Of course, the sizeof macro cannot be +applied to these types! */ + +typedef byte flst_base_node_t; +typedef byte flst_node_t; + +/* The physical size of a list base node in bytes */ +#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE) + +/* The physical size of a list node in bytes */ +#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE) + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Initializes a list base node. */ +UNIV_INLINE +void +flst_init( +/*======*/ + flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Adds a node as the last node in a list. */ +UNIV_INTERN +void +flst_add_last( +/*==========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node, /*!< in: node to add */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Adds a node as the first node in a list. */ +UNIV_INTERN +void +flst_add_first( +/*===========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node, /*!< in: node to add */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Inserts a node after another in a list. */ +UNIV_INTERN +void +flst_insert_after( +/*==============*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node1, /*!< in: node to insert after */ + flst_node_t* node2, /*!< in: node to add */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Inserts a node before another in a list. */ +UNIV_INTERN +void +flst_insert_before( +/*===============*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: node to insert */ + flst_node_t* node3, /*!< in: node to insert before */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Removes a node. */ +UNIV_INTERN +void +flst_remove( +/*========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: node to remove */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Cuts off the tail of the list, including the node given. The number of +nodes which will be removed must be provided by the caller, as this function +does not measure the length of the tail. */ +UNIV_INTERN +void +flst_cut_end( +/*=========*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: first node to remove */ + ulint n_nodes,/*!< in: number of nodes to remove, + must be >= 1 */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Cuts off the tail of the list, not including the given node. The number of +nodes which will be removed must be provided by the caller, as this function +does not measure the length of the tail. */ +UNIV_INTERN +void +flst_truncate_end( +/*==============*/ + flst_base_node_t* base, /*!< in: pointer to base node of list */ + flst_node_t* node2, /*!< in: first node not to remove */ + ulint n_nodes,/*!< in: number of nodes to remove */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Gets list length. +@return length */ +UNIV_INLINE +ulint +flst_get_len( +/*=========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Gets list first node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_first( +/*===========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Gets list last node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_last( +/*==========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Gets list next node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_next_addr( +/*===============*/ + const flst_node_t* node, /*!< in: pointer to node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Gets list prev node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_prev_addr( +/*===============*/ + const flst_node_t* node, /*!< in: pointer to node */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Writes a file address. */ +UNIV_INLINE +void +flst_write_addr( +/*============*/ + fil_faddr_t* faddr, /*!< in: pointer to file faddress */ + fil_addr_t addr, /*!< in: file address */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Reads a file address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_read_addr( +/*===========*/ + const fil_faddr_t* faddr, /*!< in: pointer to file faddress */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************************//** +Validates a file-based list. +@return TRUE if ok */ +UNIV_INTERN +ibool +flst_validate( +/*==========*/ + const flst_base_node_t* base, /*!< in: pointer to base node of list */ + mtr_t* mtr1); /*!< in: mtr */ +/********************************************************************//** +Prints info of a file-based list. */ +UNIV_INTERN +void +flst_print( +/*=======*/ + const flst_base_node_t* base, /*!< in: pointer to base node of list */ + mtr_t* mtr); /*!< in: mtr */ + + +#ifndef UNIV_NONINL +#include "fut0lst.ic" +#endif + +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/innobase/include/fut0lst.ic b/storage/innobase/include/fut0lst.ic new file mode 100644 index 00000000000..d18cf21378f --- /dev/null +++ b/storage/innobase/include/fut0lst.ic @@ -0,0 +1,167 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fut0lst.ic +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#include "fut0fut.h" +#include "mtr0log.h" +#include "buf0buf.h" + +/* We define the field offsets of a node for the list */ +#define FLST_PREV 0 /* 6-byte address of the previous list element; + the page part of address is FIL_NULL, if no + previous element */ +#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next + list element; the page part of address + is FIL_NULL, if no next element */ + +/* We define the field offsets of a base node for the list */ +#define FLST_LEN 0 /* 32-bit list length field */ +#define FLST_FIRST 4 /* 6-byte address of the first element + of the list; undefined if empty list */ +#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the + last element of the list; undefined + if empty list */ + +/********************************************************************//** +Writes a file address. */ +UNIV_INLINE +void +flst_write_addr( +/*============*/ + fil_faddr_t* faddr, /*!< in: pointer to file faddress */ + fil_addr_t addr, /*!< in: file address */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ut_ad(faddr && mtr); + ut_ad(mtr_memo_contains_page(mtr, faddr, MTR_MEMO_PAGE_X_FIX)); + ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA); + + mlog_write_ulint(faddr + FIL_ADDR_PAGE, addr.page, MLOG_4BYTES, mtr); + mlog_write_ulint(faddr + FIL_ADDR_BYTE, addr.boffset, + MLOG_2BYTES, mtr); +} + +/********************************************************************//** +Reads a file address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_read_addr( +/*===========*/ + const fil_faddr_t* faddr, /*!< in: pointer to file faddress */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + fil_addr_t addr; + + ut_ad(faddr && mtr); + + addr.page = mtr_read_ulint(faddr + FIL_ADDR_PAGE, MLOG_4BYTES, mtr); + addr.boffset = mtr_read_ulint(faddr + FIL_ADDR_BYTE, MLOG_2BYTES, + mtr); + ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA); + return(addr); +} + +/********************************************************************//** +Initializes a list base node. */ +UNIV_INLINE +void +flst_init( +/*======*/ + flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + + mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr); + flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr); + flst_write_addr(base + FLST_LAST, fil_addr_null, mtr); +} + +/********************************************************************//** +Gets list length. +@return length */ +UNIV_INLINE +ulint +flst_get_len( +/*=========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + return(mtr_read_ulint(base + FLST_LEN, MLOG_4BYTES, mtr)); +} + +/********************************************************************//** +Gets list first node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_first( +/*===========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + return(flst_read_addr(base + FLST_FIRST, mtr)); +} + +/********************************************************************//** +Gets list last node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_last( +/*==========*/ + const flst_base_node_t* base, /*!< in: pointer to base node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + return(flst_read_addr(base + FLST_LAST, mtr)); +} + +/********************************************************************//** +Gets list next node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_next_addr( +/*===============*/ + const flst_node_t* node, /*!< in: pointer to node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + return(flst_read_addr(node + FLST_NEXT, mtr)); +} + +/********************************************************************//** +Gets list prev node address. +@return file address */ +UNIV_INLINE +fil_addr_t +flst_get_prev_addr( +/*===============*/ + const flst_node_t* node, /*!< in: pointer to node */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + return(flst_read_addr(node + FLST_PREV, mtr)); +} diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h new file mode 100644 index 00000000000..7351b407e8c --- /dev/null +++ b/storage/innobase/include/ha0ha.h @@ -0,0 +1,265 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0ha.h +The hash table with external chains + +Created 8/18/1994 Heikki Tuuri +*******************************************************/ + +#ifndef ha0ha_h +#define ha0ha_h + +#include "univ.i" + +#include "hash0hash.h" +#include "page0types.h" +#include "buf0types.h" +#include "rem0types.h" + +/*************************************************************//** +Looks for an element in a hash table. +@return pointer to the data of the first hash table node in chain +having the fold number, NULL if not found */ +UNIV_INLINE +const rec_t* +ha_search_and_get_data( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: folded value of the searched data */ +/*********************************************************//** +Looks for an element when we know the pointer to the data and updates +the pointer to data if found. +@return TRUE if found */ +UNIV_INTERN +ibool +ha_search_and_update_if_found_func( +/*===============================*/ + hash_table_t* table, /*!< in/out: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data, /*!< in: pointer to the data */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* new_block,/*!< in: block containing new_data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* new_data);/*!< in: new pointer to the data */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Looks for an element when we know the pointer to the data and +updates the pointer to data if found. +@param table in/out: hash table +@param fold in: folded value of the searched data +@param data in: pointer to the data +@param new_block in: block containing new_data +@param new_data in: new pointer to the data */ +# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \ + ha_search_and_update_if_found_func(table,fold,data,new_block,new_data) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/** Looks for an element when we know the pointer to the data and +updates the pointer to data if found. +@param table in/out: hash table +@param fold in: folded value of the searched data +@param data in: pointer to the data +@param new_block ignored: block containing new_data +@param new_data in: new pointer to the data */ +# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \ + ha_search_and_update_if_found_func(table,fold,data,new_data) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/*************************************************************//** +Creates a hash table with at least n array cells. The actual number +of cells is chosen to be a prime number slightly bigger than n. +@return own: created table */ +UNIV_INTERN +hash_table_t* +ha_create_func( +/*===========*/ + ulint n, /*!< in: number of array cells */ +#ifdef UNIV_SYNC_DEBUG + ulint mutex_level, /*!< in: level of the mutexes in the latching + order: this is used in the debug version */ +#endif /* UNIV_SYNC_DEBUG */ + ulint n_mutexes, /*!< in: number of mutexes to protect the + hash table: must be a power of 2, or 0 */ + ulint type); /*!< in: type of datastructure for which + the memory heap is going to be used e.g.: + MEM_HEAP_FOR_BTR_SEARCH or + MEM_HEAP_FOR_PAGE_HASH */ +#ifdef UNIV_SYNC_DEBUG +/** Creates a hash table. +@return own: created table +@param n_c in: number of array cells. The actual number of cells is +chosen to be a slightly bigger prime number. +@param level in: level of the mutexes in the latching order +@param n_m in: number of mutexes to protect the hash table; + must be a power of 2, or 0 */ +# define ha_create(n_c,n_m,type,level) ha_create_func(n_c,level,n_m,type) +#else /* UNIV_SYNC_DEBUG */ +/** Creates a hash table. +@return own: created table +@param n_c in: number of array cells. The actual number of cells is +chosen to be a slightly bigger prime number. +@param level in: level of the mutexes in the latching order +@param n_m in: number of mutexes to protect the hash table; + must be a power of 2, or 0 */ +# define ha_create(n_c,n_m,type,level) ha_create_func(n_c,n_m,type) +#endif /* UNIV_SYNC_DEBUG */ + +/*************************************************************//** +Empties a hash table and frees the memory heaps. */ +UNIV_INTERN +void +ha_clear( +/*=====*/ + hash_table_t* table); /*!< in, own: hash table */ + +/*************************************************************//** +Inserts an entry into a hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. +@return TRUE if succeed, FALSE if no more memory could be allocated */ +UNIV_INTERN +ibool +ha_insert_for_fold_func( +/*====================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of data; if a node with + the same fold value already exists, it is + updated to point to the same data, and no new + node is created! */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /*!< in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data); /*!< in: data, must not be NULL */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** +Inserts an entry into a hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. +@return TRUE if succeed, FALSE if no more memory could be allocated +@param t in: hash table +@param f in: folded value of data +@param b in: buffer block containing the data +@param d in: data, must not be NULL */ +# define ha_insert_for_fold(t,f,b,d) do { \ + ha_insert_for_fold_func(t,f,b,d); \ + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); \ +} while(0) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/** +Inserts an entry into a hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. +@return TRUE if succeed, FALSE if no more memory could be allocated +@param t in: hash table +@param f in: folded value of data +@param b ignored: buffer block containing the data +@param d in: data, must not be NULL */ +# define ha_insert_for_fold(t,f,b,d) do { \ + ha_insert_for_fold_func(t,f,d); \ + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); \ +} while (0) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +/*********************************************************//** +Looks for an element when we know the pointer to the data and deletes +it from the hash table if found. +@return TRUE if found */ +UNIV_INLINE +ibool +ha_search_and_delete_if_found( +/*==========================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data); /*!< in: pointer to the data */ +#ifndef UNIV_HOTBACKUP +/*****************************************************************//** +Removes from the chain determined by fold all nodes whose data pointer +points to the page given. */ +UNIV_INTERN +void +ha_remove_all_nodes_to_page( +/*========================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: fold value */ + const page_t* page); /*!< in: buffer page */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/*************************************************************//** +Validates a given range of the cells in hash table. +@return TRUE if ok */ +UNIV_INTERN +ibool +ha_validate( +/*========*/ + hash_table_t* table, /*!< in: hash table */ + ulint start_index, /*!< in: start index */ + ulint end_index); /*!< in: end index */ +#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ +/*************************************************************//** +Prints info of a hash table. */ +UNIV_INTERN +void +ha_print_info( +/*==========*/ + FILE* file, /*!< in: file where to print */ + hash_table_t* table); /*!< in: hash table */ +#endif /* !UNIV_HOTBACKUP */ + +/** The hash table external chain node */ +struct ha_node_t { + ha_node_t* next; /*!< next chain node or NULL if none */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block; /*!< buffer block containing the data, or NULL */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data; /*!< pointer to the data */ + ulint fold; /*!< fold value for the data */ +}; + +#ifdef UNIV_DEBUG +/********************************************************************//** +Assert that the synchronization object in a hash operation involving +possible change in the hash table is held. +Note that in case of mutexes we assert that mutex is owned while in case +of rw-locks we assert that it is held in exclusive mode. */ +UNIV_INLINE +void +hash_assert_can_modify( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold value */ +/********************************************************************//** +Assert that the synchronization object in a hash search operation is held. +Note that in case of mutexes we assert that mutex is owned while in case +of rw-locks we assert that it is held either in x-mode or s-mode. */ +UNIV_INLINE +void +hash_assert_can_search( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold value */ +#else /* UNIV_DEBUG */ +#define hash_assert_can_modify(t, f) +#define hash_assert_can_search(t, f) +#endif /* UNIV_DEBUG */ + + +#ifndef UNIV_NONINL +#include "ha0ha.ic" +#endif + +#endif diff --git a/storage/innobase/include/ha0ha.ic b/storage/innobase/include/ha0ha.ic new file mode 100644 index 00000000000..c478ff54303 --- /dev/null +++ b/storage/innobase/include/ha0ha.ic @@ -0,0 +1,246 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ha0ha.ic +The hash table with external chains + +Created 8/18/1994 Heikki Tuuri +*************************************************************************/ + +#include "ut0rnd.h" +#include "mem0mem.h" +#include "btr0types.h" + +/***********************************************************//** +Deletes a hash node. */ +UNIV_INTERN +void +ha_delete_hash_node( +/*================*/ + hash_table_t* table, /*!< in: hash table */ + ha_node_t* del_node); /*!< in: node to be deleted */ + +/******************************************************************//** +Gets a hash node data. +@return pointer to the data */ +UNIV_INLINE +const rec_t* +ha_node_get_data( +/*=============*/ + const ha_node_t* node) /*!< in: hash chain node */ +{ + return(node->data); +} + +/******************************************************************//** +Sets hash node data. */ +UNIV_INLINE +void +ha_node_set_data_func( +/*==================*/ + ha_node_t* node, /*!< in: hash chain node */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /*!< in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data) /*!< in: pointer to the data */ +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + node->block = block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + node->data = data; +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Sets hash node data. +@param n in: hash chain node +@param b in: buffer block containing the data +@param d in: pointer to the data */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/** Sets hash node data. +@param n in: hash chain node +@param b in: buffer block containing the data +@param d in: pointer to the data */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +/******************************************************************//** +Gets the next node in a hash chain. +@return next node, NULL if none */ +UNIV_INLINE +ha_node_t* +ha_chain_get_next( +/*==============*/ + ha_node_t* node) /*!< in: hash chain node */ +{ + return(node->next); +} + +/******************************************************************//** +Gets the first node in a hash chain. +@return first node, NULL if none */ +UNIV_INLINE +ha_node_t* +ha_chain_get_first( +/*===============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold value determining the chain */ +{ + return((ha_node_t*) + hash_get_nth_cell(table, hash_calc_hash(fold, table))->node); +} + +#ifdef UNIV_DEBUG +/********************************************************************//** +Assert that the synchronization object in a hash operation involving +possible change in the hash table is held. +Note that in case of mutexes we assert that mutex is owned while in case +of rw-locks we assert that it is held in exclusive mode. */ +UNIV_INLINE +void +hash_assert_can_modify( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold value */ +{ + if (table->type == HASH_TABLE_SYNC_MUTEX) { + ut_ad(mutex_own(hash_get_mutex(table, fold))); + } else if (table->type == HASH_TABLE_SYNC_RW_LOCK) { +# ifdef UNIV_SYNC_DEBUG + rw_lock_t* lock = hash_get_lock(table, fold); + ut_ad(rw_lock_own(lock, RW_LOCK_EX)); +# endif + } else { + ut_ad(table->type == HASH_TABLE_SYNC_NONE); + } +} + +/********************************************************************//** +Assert that the synchronization object in a hash search operation is held. +Note that in case of mutexes we assert that mutex is owned while in case +of rw-locks we assert that it is held either in x-mode or s-mode. */ +UNIV_INLINE +void +hash_assert_can_search( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold value */ +{ + if (table->type == HASH_TABLE_SYNC_MUTEX) { + ut_ad(mutex_own(hash_get_mutex(table, fold))); + } else if (table->type == HASH_TABLE_SYNC_RW_LOCK) { +# ifdef UNIV_SYNC_DEBUG + rw_lock_t* lock = hash_get_lock(table, fold); + ut_ad(rw_lock_own(lock, RW_LOCK_EX) + || rw_lock_own(lock, RW_LOCK_SHARED)); +# endif + } else { + ut_ad(table->type == HASH_TABLE_SYNC_NONE); + } +} +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Looks for an element in a hash table. +@return pointer to the data of the first hash table node in chain +having the fold number, NULL if not found */ +UNIV_INLINE +const rec_t* +ha_search_and_get_data( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: folded value of the searched data */ +{ + ha_node_t* node; + + hash_assert_can_search(table, fold); + ut_ad(btr_search_enabled); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (node->fold == fold) { + + return(node->data); + } + + node = ha_chain_get_next(node); + } + + return(NULL); +} + +/*********************************************************//** +Looks for an element when we know the pointer to the data. +@return pointer to the hash table node, NULL if not found in the table */ +UNIV_INLINE +ha_node_t* +ha_search_with_data( +/*================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data) /*!< in: pointer to the data */ +{ + ha_node_t* node; + + hash_assert_can_search(table, fold); + + ut_ad(btr_search_enabled); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (node->data == data) { + + return(node); + } + + node = ha_chain_get_next(node); + } + + return(NULL); +} + +/*********************************************************//** +Looks for an element when we know the pointer to the data, and deletes +it from the hash table, if found. +@return TRUE if found */ +UNIV_INLINE +ibool +ha_search_and_delete_if_found( +/*==========================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data) /*!< in: pointer to the data */ +{ + ha_node_t* node; + + hash_assert_can_modify(table, fold); + ut_ad(btr_search_enabled); + + node = ha_search_with_data(table, fold, data); + + if (node) { + ha_delete_hash_node(table, node); + + return(TRUE); + } + + return(FALSE); +} diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h new file mode 100644 index 00000000000..0073930b502 --- /dev/null +++ b/storage/innobase/include/ha0storage.h @@ -0,0 +1,140 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0storage.h +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 22, 2007 Vasil Dimov +*******************************************************/ + +#ifndef ha0storage_h +#define ha0storage_h + +#include "univ.i" + +/** This value is used by default by ha_storage_create(). More memory +is allocated later when/if it is needed. */ +#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024 + +/** This value is used by default by ha_storage_create(). It is a +constant per ha_storage's lifetime. */ +#define HA_STORAGE_DEFAULT_HASH_CELLS 4096 + +/** Hash storage */ +struct ha_storage_t; + +/*******************************************************************//** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. +@return own: hash storage */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + ulint initial_heap_bytes, /*!< in: initial heap's size */ + ulint initial_hash_cells); /*!< in: initial number of cells + in the hash table */ + +/*******************************************************************//** +Copies data into the storage and returns a pointer to the copy. If the +same data chunk is already present, then pointer to it is returned. +Data chunks are considered to be equal if len1 == len2 and +memcmp(data1, data2, len1) == 0. If "data" is not present (and thus +data_len bytes need to be allocated) and the size of storage is going to +become more than "memlim" then "data" is not added and NULL is returned. +To disable this behavior "memlim" can be set to 0, which stands for +"no limit". +@return pointer to the copy */ +UNIV_INTERN +const void* +ha_storage_put_memlim( +/*==================*/ + ha_storage_t* storage, /*!< in/out: hash storage */ + const void* data, /*!< in: data to store */ + ulint data_len, /*!< in: data length */ + ulint memlim); /*!< in: memory limit to obey */ + +/*******************************************************************//** +Same as ha_storage_put_memlim() but without memory limit. +@param storage in/out: hash storage +@param data in: data to store +@param data_len in: data length +@return pointer to the copy of the string */ +#define ha_storage_put(storage, data, data_len) \ + ha_storage_put_memlim((storage), (data), (data_len), 0) + +/*******************************************************************//** +Copies string into the storage and returns a pointer to the copy. If the +same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. +@param storage in/out: hash storage +@param str in: string to put +@return pointer to the copy of the string */ +#define ha_storage_put_str(storage, str) \ + ((const char*) ha_storage_put((storage), (str), strlen(str) + 1)) + +/*******************************************************************//** +Copies string into the storage and returns a pointer to the copy obeying +a memory limit. +If the same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. +@param storage in/out: hash storage +@param str in: string to put +@param memlim in: memory limit to obey +@return pointer to the copy of the string */ +#define ha_storage_put_str_memlim(storage, str, memlim) \ + ((const char*) ha_storage_put_memlim((storage), (str), \ + strlen(str) + 1, (memlim))) + +/*******************************************************************//** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). +The hash storage is not invalidated itself and can be used again. */ +UNIV_INLINE +void +ha_storage_empty( +/*=============*/ + ha_storage_t** storage); /*!< in/out: hash storage */ + +/*******************************************************************//** +Frees a hash storage and everything it contains, it cannot be used after +this call. +This invalidates any pointers previously returned by ha_storage_put(). */ +UNIV_INLINE +void +ha_storage_free( +/*============*/ + ha_storage_t* storage); /*!< in, own: hash storage */ + +/*******************************************************************//** +Gets the size of the memory used by a storage. +@return bytes used */ +UNIV_INLINE +ulint +ha_storage_get_size( +/*================*/ + const ha_storage_t* storage); /*!< in: hash storage */ + +#ifndef UNIV_NONINL +#include "ha0storage.ic" +#endif + +#endif /* ha0storage_h */ diff --git a/storage/innobase/include/ha0storage.ic b/storage/innobase/include/ha0storage.ic new file mode 100644 index 00000000000..7150ca045ec --- /dev/null +++ b/storage/innobase/include/ha0storage.ic @@ -0,0 +1,146 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0storage.ic +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 24, 2007 Vasil Dimov +*******************************************************/ + +#include "univ.i" +#include "ha0storage.h" +#include "hash0hash.h" +#include "mem0mem.h" + +/** Hash storage for strings */ +struct ha_storage_t { + mem_heap_t* heap; /*!< memory heap from which memory is + allocated */ + hash_table_t* hash; /*!< hash table used to avoid + duplicates */ +}; + +/** Objects of this type are stored in ha_storage_t */ +struct ha_storage_node_t { + ulint data_len;/*!< length of the data */ + const void* data; /*!< pointer to data */ + ha_storage_node_t* next; /*!< next node in hash chain */ +}; + +/*******************************************************************//** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. +@return own: hash storage */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + ulint initial_heap_bytes, /*!< in: initial heap's size */ + ulint initial_hash_cells) /*!< in: initial number of cells + in the hash table */ +{ + ha_storage_t* storage; + mem_heap_t* heap; + + if (initial_heap_bytes == 0) { + + initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES; + } + + if (initial_hash_cells == 0) { + + initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS; + } + + /* we put "storage" within "storage->heap" */ + + heap = mem_heap_create(sizeof(ha_storage_t) + + initial_heap_bytes); + + storage = (ha_storage_t*) mem_heap_alloc(heap, + sizeof(ha_storage_t)); + + storage->heap = heap; + storage->hash = hash_create(initial_hash_cells); + + return(storage); +} + +/*******************************************************************//** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). +The hash storage is not invalidated itself and can be used again. */ +UNIV_INLINE +void +ha_storage_empty( +/*=============*/ + ha_storage_t** storage) /*!< in/out: hash storage */ +{ + ha_storage_t temp_storage; + + temp_storage.heap = (*storage)->heap; + temp_storage.hash = (*storage)->hash; + + hash_table_clear(temp_storage.hash); + mem_heap_empty(temp_storage.heap); + + *storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap, + sizeof(ha_storage_t)); + + (*storage)->heap = temp_storage.heap; + (*storage)->hash = temp_storage.hash; +} + +/*******************************************************************//** +Frees a hash storage and everything it contains, it cannot be used after +this call. +This invalidates any pointers previously returned by ha_storage_put(). */ +UNIV_INLINE +void +ha_storage_free( +/*============*/ + ha_storage_t* storage) /*!< in, own: hash storage */ +{ + /* order is important because the pointer storage->hash is + within the heap */ + hash_table_free(storage->hash); + mem_heap_free(storage->heap); +} + +/*******************************************************************//** +Gets the size of the memory used by a storage. +@return bytes used */ +UNIV_INLINE +ulint +ha_storage_get_size( +/*================*/ + const ha_storage_t* storage) /*!< in: hash storage */ +{ + ulint ret; + + ret = mem_heap_get_size(storage->heap); + + /* this assumes hash->heap and hash->heaps are NULL */ + ret += sizeof(hash_table_t); + ret += sizeof(hash_cell_t) * hash_get_n_cells(storage->hash); + + return(ret); +} diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h new file mode 100644 index 00000000000..fa202aa773e --- /dev/null +++ b/storage/innobase/include/ha_prototypes.h @@ -0,0 +1,596 @@ +/***************************************************************************** + +Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ha_prototypes.h +Prototypes for global functions in ha_innodb.cc that are called by +InnoDB C code + +Created 5/11/2006 Osku Salerma +************************************************************************/ + +#ifndef HA_INNODB_PROTOTYPES_H +#define HA_INNODB_PROTOTYPES_H + +#include "my_dbug.h" +#include "mysqld_error.h" +#include "my_compare.h" +#include "my_sys.h" +#include "m_string.h" +#include "debug_sync.h" + +#include "trx0types.h" +#include "m_ctype.h" /* CHARSET_INFO */ + +// Forward declarations +class Field; +struct fts_string_t; + +/*********************************************************************//** +Wrapper around MySQL's copy_and_convert function. +@return number of bytes copied to 'to' */ +UNIV_INTERN +ulint +innobase_convert_string( +/*====================*/ + void* to, /*!< out: converted string */ + ulint to_length, /*!< in: number of bytes reserved + for the converted string */ + CHARSET_INFO* to_cs, /*!< in: character set to convert to */ + const void* from, /*!< in: string to convert */ + ulint from_length, /*!< in: number of bytes to convert */ + CHARSET_INFO* from_cs, /*!< in: character set to convert + from */ + uint* errors); /*!< out: number of errors encountered + during the conversion */ + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +UNIV_INTERN +ulint +innobase_raw_format( +/*================*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint charset_coll, /*!< in: charset collation */ + char* buf, /*!< out: output buffer */ + ulint buf_size); /*!< in: output buffer size + in bytes */ + +/*****************************************************************//** +Invalidates the MySQL query cache for the table. */ +UNIV_INTERN +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /*!< in: transaction which + modifies the table */ + const char* full_name, /*!< in: concatenation of + database name, null char NUL, + table name, null char NUL; + NOTE that in Windows this is + always in LOWER CASE! */ + ulint full_name_len); /*!< in: full name length where + also the null chars count */ + +/*****************************************************************//** +Convert a table or index name to the MySQL system_charset_info (UTF-8) +and quote it if needed. +@return pointer to the end of buf */ +UNIV_INTERN +char* +innobase_convert_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: identifier to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ + ibool table_id);/*!< in: TRUE=id is a table or database name; + FALSE=id is an index name */ + +/******************************************************************//** +Returns true if the thread is the replication thread on the slave +server. Used in srv_conc_enter_innodb() to determine if the thread +should be allowed to enter InnoDB - the replication thread is treated +differently than other threads. Also used in +srv_conc_force_exit_innodb(). +@return true if thd is the replication thread */ +UNIV_INTERN +ibool +thd_is_replication_slave_thread( +/*============================*/ + THD* thd); /*!< in: thread handle */ + +/******************************************************************//** +Gets information on the durability property requested by thread. +Used when writing either a prepare or commit record to the log +buffer. +@return the durability property. */ +UNIV_INTERN +enum durability_properties +thd_requested_durability( +/*=====================*/ + const THD* thd) /*!< in: thread handle */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************************//** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. +@return true if non-transactional tables have been edited */ +UNIV_INTERN +ibool +thd_has_edited_nontrans_tables( +/*===========================*/ + THD* thd); /*!< in: thread handle */ + +/*************************************************************//** +Prints info of a THD object (== user session thread) to the given file. */ +UNIV_INTERN +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /*!< in: output stream */ + THD* thd, /*!< in: pointer to a MySQL THD object */ + uint max_query_len); /*!< in: max query length to print, or 0 to + use the default max length */ + +/*************************************************************//** +InnoDB uses this function to compare two data fields for which the data type +is such that we must use MySQL code to compare them. +@return 1, 0, -1, if a is greater, equal, less than b, respectively */ +UNIV_INTERN +int +innobase_mysql_cmp( +/*===============*/ + int mysql_type, /*!< in: MySQL type */ + uint charset_number, /*!< in: number of the charset */ + const unsigned char* a, /*!< in: data field */ + unsigned int a_length, /*!< in: data field length, + not UNIV_SQL_NULL */ + const unsigned char* b, /*!< in: data field */ + unsigned int b_length) /*!< in: data field length, + not UNIV_SQL_NULL */ + __attribute__((nonnull, warn_unused_result)); +/**************************************************************//** +Converts a MySQL type to an InnoDB type. Note that this function returns +the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 +VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. +@return DATA_BINARY, DATA_VARCHAR, ... */ +UNIV_INTERN +ulint +get_innobase_type_from_mysql_type( +/*==============================*/ + ulint* unsigned_flag, /*!< out: DATA_UNSIGNED if an + 'unsigned type'; + at least ENUM and SET, + and unsigned integer + types are 'unsigned types' */ + const void* field) /*!< in: MySQL Field */ + __attribute__((nonnull)); + +/******************************************************************//** +Get the variable length bounds of the given character set. */ +UNIV_INTERN +void +innobase_get_cset_width( +/*====================*/ + ulint cset, /*!< in: MySQL charset-collation code */ + ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */ + ulint* mbmaxlen); /*!< out: maximum length of a char (in bytes) */ + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. +@return 0 if a=b, <0 if a<b, >1 if a>b */ +UNIV_INTERN +int +innobase_strcasecmp( +/*================*/ + const char* a, /*!< in: first string to compare */ + const char* b); /*!< in: second string to compare */ + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. The +second string contains wildcards. +@return 0 if a match is found, 1 if not */ +UNIV_INTERN +int +innobase_wildcasecmp( +/*=================*/ + const char* a, /*!< in: string to compare */ + const char* b); /*!< in: wildcard string to compare */ + +/******************************************************************//** +Strip dir name from a full path name and return only its file name. +@return file name or "null" if no file name */ +UNIV_INTERN +const char* +innobase_basename( +/*==============*/ + const char* path_name); /*!< in: full path name */ + +/******************************************************************//** +Returns true if the thread is executing a SELECT statement. +@return true if thd is executing SELECT */ +UNIV_INTERN +ibool +thd_is_select( +/*==========*/ + const THD* thd); /*!< in: thread handle */ + +/******************************************************************//** +Converts an identifier to a table name. */ +UNIV_INTERN +void +innobase_convert_from_table_id( +/*===========================*/ + struct charset_info_st* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len); /*!< in: length of 'to', in bytes; should + be at least 5 * strlen(to) + 1 */ +/******************************************************************//** +Converts an identifier to UTF-8. */ +UNIV_INTERN +void +innobase_convert_from_id( +/*=====================*/ + struct charset_info_st* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len); /*!< in: length of 'to', in bytes; + should be at least 3 * strlen(to) + 1 */ +/******************************************************************//** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +UNIV_INTERN +void +innobase_casedn_str( +/*================*/ + char* a); /*!< in/out: string to put in lower case */ + +/**********************************************************************//** +Determines the connection character set. +@return connection character set */ +UNIV_INTERN +struct charset_info_st* +innobase_get_charset( +/*=================*/ + THD* thd); /*!< in: MySQL thread handle */ +/**********************************************************************//** +Determines the current SQL statement. +@return SQL statement string */ +UNIV_INTERN +const char* +innobase_get_stmt( +/*==============*/ + THD* thd, /*!< in: MySQL thread handle */ + size_t* length) /*!< out: length of the SQL statement */ + __attribute__((nonnull)); +/******************************************************************//** +This function is used to find the storage length in bytes of the first n +characters for prefix indexes using a multibyte character set. The function +finds charset information and returns length of prefix_len characters in the +index field in bytes. +@return number of bytes occupied by the first n characters */ +UNIV_INTERN +ulint +innobase_get_at_most_n_mbchars( +/*===========================*/ + ulint charset_id, /*!< in: character set id */ + ulint prefix_len, /*!< in: prefix length in bytes of the index + (this has to be divided by mbmaxlen to get the + number of CHARACTERS n in the prefix) */ + ulint data_len, /*!< in: length of the string in bytes */ + const char* str); /*!< in: character string */ + +/*************************************************************//** +InnoDB index push-down condition check +@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */ +UNIV_INTERN +enum icp_result +innobase_index_cond( +/*================*/ + void* file) /*!< in/out: pointer to ha_innobase */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Returns true if the thread supports XA, +global value of innodb_supports_xa if thd is NULL. +@return true if thd supports XA */ +UNIV_INTERN +ibool +thd_supports_xa( +/*============*/ + THD* thd); /*!< in: thread handle, or NULL to query + the global innodb_supports_xa */ + +/******************************************************************//** +Returns the lock wait timeout for the current connection. +@return the lock wait timeout, in seconds */ +UNIV_INTERN +ulong +thd_lock_wait_timeout( +/*==================*/ + THD* thd); /*!< in: thread handle, or NULL to query + the global innodb_lock_wait_timeout */ +/******************************************************************//** +Add up the time waited for the lock for the current query. */ +UNIV_INTERN +void +thd_set_lock_wait_time( +/*===================*/ + THD* thd, /*!< in/out: thread handle */ + ulint value); /*!< in: time waited for the lock */ + +/**********************************************************************//** +Get the current setting of the table_cache_size global parameter. We do +a dirty read because for one there is no synchronization object and +secondly there is little harm in doing so even if we get a torn read. +@return SQL statement string */ +UNIV_INTERN +ulint +innobase_get_table_cache_size(void); +/*===============================*/ + +/**********************************************************************//** +Get the current setting of the lower_case_table_names global parameter from +mysqld.cc. We do a dirty read because for one there is no synchronization +object and secondly there is little harm in doing so even if we get a torn +read. +@return value of lower_case_table_names */ +UNIV_INTERN +ulint +innobase_get_lower_case_table_names(void); +/*=====================================*/ + +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return 0 or error number */ +UNIV_INTERN +int +innobase_close_thd( +/*===============*/ + THD* thd); /*!< in: MySQL thread handle for + which to close the connection */ +/*************************************************************//** +Get the next token from the given string and store it in *token. */ +UNIV_INTERN +ulint +innobase_mysql_fts_get_token( +/*=========================*/ + CHARSET_INFO* charset, /*!< in: Character set */ + const byte* start, /*!< in: start of text */ + const byte* end, /*!< in: one character past end of + text */ + fts_string_t* token, /*!< out: token's text */ + ulint* offset); /*!< out: offset to token, + measured as characters from + 'start' */ + +/******************************************************************//** +compare two character string case insensitively according to their charset. */ +UNIV_INTERN +int +innobase_fts_text_case_cmp( +/*=======================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/****************************************************************//** +Get FTS field charset info from the field's prtype +@return charset info */ +UNIV_INTERN +CHARSET_INFO* +innobase_get_fts_charset( +/*=====================*/ + int mysql_type, /*!< in: MySQL type */ + uint charset_number);/*!< in: number of the charset */ +/******************************************************************//** +Returns true if transaction should be flagged as read-only. +@return true if the thd is marked as read-only */ +UNIV_INTERN +ibool +thd_trx_is_read_only( +/*=================*/ + THD* thd); /*!< in/out: thread handle */ + +/******************************************************************//** +Check if the transaction is an auto-commit transaction. TRUE also +implies that it is a SELECT (read-only) transaction. +@return true if the transaction is an auto commit read-only transaction. */ +UNIV_INTERN +ibool +thd_trx_is_auto_commit( +/*===================*/ + THD* thd); /*!< in: thread handle, or NULL */ + +/*****************************************************************//** +A wrapper function of innobase_convert_name(), convert a table or +index name to the MySQL system_charset_info (UTF-8) and quote it if needed. +@return pointer to the end of buf */ +UNIV_INTERN +void +innobase_format_name( +/*==================*/ + char* buf, /*!< out: buffer for converted + identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* name, /*!< in: index or table name + to format */ + ibool is_index_name) /*!< in: index name */ + __attribute__((nonnull)); + +/** Corresponds to Sql_condition:enum_warning_level. */ +enum ib_log_level_t { + IB_LOG_LEVEL_INFO, + IB_LOG_LEVEL_WARN, + IB_LOG_LEVEL_ERROR, + IB_LOG_LEVEL_FATAL +}; + +/******************************************************************//** +Use this when the args are first converted to a formatted string and then +passed to the format string from errmsg-utf8.txt. The error message format +must be: "Some string ... %s". + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +UNIV_INTERN +void +ib_errf( +/*====*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + const char* format, /*!< printf format */ + ...) /*!< Args */ + __attribute__((format(printf, 4, 5))); + +/******************************************************************//** +Use this when the args are passed to the format string from +errmsg-utf8.txt directly as is. + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +UNIV_INTERN +void +ib_senderrf( +/*========*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + ...); /*!< Args */ + +/******************************************************************//** +Write a message to the MySQL log, prefixed with "InnoDB: ". +Wrapper around sql_print_information() */ +UNIV_INTERN +void +ib_logf( +/*====*/ + ib_log_level_t level, /*!< in: warning level */ + const char* format, /*!< printf format */ + ...) /*!< Args */ + __attribute__((format(printf, 2, 3))); + +/******************************************************************//** +Returns the NUL terminated value of glob_hostname. +@return pointer to glob_hostname. */ +UNIV_INTERN +const char* +server_get_hostname(); +/*=================*/ + +/******************************************************************//** +Get the error message format string. +@return the format string or 0 if not found. */ +UNIV_INTERN +const char* +innobase_get_err_msg( +/*=================*/ + int error_code); /*!< in: MySQL error code */ + +/*********************************************************************//** +Compute the next autoinc value. + +For MySQL replication the autoincrement values can be partitioned among +the nodes. The offset is the start or origin of the autoincrement value +for a particular node. For n nodes the increment will be n and the offset +will be in the interval [1, n]. The formula tries to allocate the next +value for a particular node. + +Note: This function is also called with increment set to the number of +values we want to reserve for multi-value inserts e.g., + + INSERT INTO T VALUES(), (), (); + +innobase_next_autoinc() will be called with increment set to 3 where +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for +the multi-value INSERT above. +@return the next value */ +UNIV_INTERN +ulonglong +innobase_next_autoinc( +/*==================*/ + ulonglong current, /*!< in: Current value */ + ulonglong need, /*!< in: count of values needed */ + ulonglong step, /*!< in: AUTOINC increment step */ + ulonglong offset, /*!< in: AUTOINC offset */ + ulonglong max_value) /*!< in: max value for type */ + __attribute__((pure, warn_unused_result)); + +/********************************************************************//** +Get the upper limit of the MySQL integral and floating-point type. +@return maximum allowed value for the field */ +UNIV_INTERN +ulonglong +innobase_get_int_col_max_value( +/*===========================*/ + const Field* field) /*!< in: MySQL field */ + __attribute__((nonnull, pure, warn_unused_result)); + +/********************************************************************** +Check if the length of the identifier exceeds the maximum allowed. +The input to this function is an identifier in charset my_charset_filename. +return true when length of identifier is too long. */ +UNIV_INTERN +my_bool +innobase_check_identifier_length( +/*=============================*/ + const char* id); /* in: identifier to check. it must belong + to charset my_charset_filename */ + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. */ +uint +innobase_convert_to_system_charset( +/*===============================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len, /* in: length of 'to', in bytes */ + uint* errors); /* out: error return */ + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. */ +uint +innobase_convert_to_filename_charset( +/*=================================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len); /* in: length of 'to', in bytes */ + + +#endif /* HA_INNODB_PROTOTYPES_H */ diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h new file mode 100644 index 00000000000..66b963ae39a --- /dev/null +++ b/storage/innobase/include/handler0alter.h @@ -0,0 +1,114 @@ +/***************************************************************************** + +Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/handler0alter.h +Smart ALTER TABLE +*******************************************************/ + +/*************************************************************//** +Copies an InnoDB record to table->record[0]. */ +UNIV_INTERN +void +innobase_rec_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets)/*!< in: rec_get_offsets( + rec, index, ...) */ + __attribute__((nonnull)); + +/*************************************************************//** +Copies an InnoDB index entry to table->record[0]. */ +UNIV_INTERN +void +innobase_fields_to_mysql( +/*=====================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_index_t* index, /*!< in: InnoDB index */ + const dfield_t* fields) /*!< in: InnoDB index fields */ + __attribute__((nonnull)); + +/*************************************************************//** +Copies an InnoDB row to table->record[0]. */ +UNIV_INTERN +void +innobase_row_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_table_t* itab, /*!< in: InnoDB table */ + const dtuple_t* row) /*!< in: InnoDB row */ + __attribute__((nonnull)); + +/*************************************************************//** +Resets table->record[0]. */ +UNIV_INTERN +void +innobase_rec_reset( +/*===============*/ + struct TABLE* table) /*!< in/out: MySQL table */ + __attribute__((nonnull)); + +/** Generate the next autoinc based on a snapshot of the session +auto_increment_increment and auto_increment_offset variables. */ +struct ib_sequence_t { + + /** + @param thd - the session + @param start_value - the lower bound + @param max_value - the upper bound (inclusive) */ + ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value); + + /** + Postfix increment + @return the value to insert */ + ulonglong operator++(int) UNIV_NOTHROW; + + /** Check if the autoinc "sequence" is exhausted. + @return true if the sequence is exhausted */ + bool eof() const UNIV_NOTHROW + { + return(m_eof); + } + + /** + @return the next value in the sequence */ + ulonglong last() const UNIV_NOTHROW + { + ut_ad(m_next_value > 0); + + return(m_next_value); + } + + /** Maximum calumn value if adding an AUTOINC column else 0. Once + we reach the end of the sequence it will be set to ~0. */ + const ulonglong m_max_value; + + /** Value of auto_increment_increment */ + ulong m_increment; + + /** Value of auto_increment_offset */ + ulong m_offset; + + /** Next value in the sequence */ + ulonglong m_next_value; + + /** true if no more values left in the sequence */ + bool m_eof; +}; diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h new file mode 100644 index 00000000000..6f9a628df5d --- /dev/null +++ b/storage/innobase/include/hash0hash.h @@ -0,0 +1,575 @@ +/***************************************************************************** + +Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/hash0hash.h +The simple hash table utility + +Created 5/20/1997 Heikki Tuuri +*******************************************************/ + +#ifndef hash0hash_h +#define hash0hash_h + +#include "univ.i" +#include "mem0mem.h" +#ifndef UNIV_HOTBACKUP +# include "sync0sync.h" +# include "sync0rw.h" +#endif /* !UNIV_HOTBACKUP */ + +struct hash_table_t; +struct hash_cell_t; + +typedef void* hash_node_t; + +/* Fix Bug #13859: symbol collision between imap/mysql */ +#define hash_create hash0_create + +/* Differnt types of hash_table based on the synchronization +method used for it. */ +enum hash_table_sync_t { + HASH_TABLE_SYNC_NONE = 0, /*!< Don't use any internal + synchronization objects for + this hash_table. */ + HASH_TABLE_SYNC_MUTEX, /*!< Use mutexes to control + access to this hash_table. */ + HASH_TABLE_SYNC_RW_LOCK /*!< Use rw_locks to control + access to this hash_table. */ +}; + +/*************************************************************//** +Creates a hash table with >= n array cells. The actual number +of cells is chosen to be a prime number slightly bigger than n. +@return own: created table */ +UNIV_INTERN +hash_table_t* +hash_create( +/*========*/ + ulint n); /*!< in: number of array cells */ +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Creates a sync object array array to protect a hash table. +::sync_obj can be mutexes or rw_locks depening on the type of +hash table. */ +UNIV_INTERN +void +hash_create_sync_obj_func( +/*======================*/ + hash_table_t* table, /*!< in: hash table */ + enum hash_table_sync_t type, /*!< in: HASH_TABLE_SYNC_MUTEX + or HASH_TABLE_SYNC_RW_LOCK */ +#ifdef UNIV_SYNC_DEBUG + ulint sync_level,/*!< in: latching order level + of the mutexes: used in the + debug version */ +#endif /* UNIV_SYNC_DEBUG */ + ulint n_sync_obj);/*!< in: number of sync objects, + must be a power of 2 */ +#ifdef UNIV_SYNC_DEBUG +# define hash_create_sync_obj(t, s, n, level) \ + hash_create_sync_obj_func(t, s, level, n) +#else /* UNIV_SYNC_DEBUG */ +# define hash_create_sync_obj(t, s, n, level) \ + hash_create_sync_obj_func(t, s, n) +#endif /* UNIV_SYNC_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + +/*************************************************************//** +Frees a hash table. */ +UNIV_INTERN +void +hash_table_free( +/*============*/ + hash_table_t* table); /*!< in, own: hash table */ +/**************************************************************//** +Calculates the hash value from a folded value. +@return hashed value */ +UNIV_INLINE +ulint +hash_calc_hash( +/*===========*/ + ulint fold, /*!< in: folded value */ + hash_table_t* table); /*!< in: hash table */ +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Assert that the mutex for the table is held */ +# define HASH_ASSERT_OWN(TABLE, FOLD) \ + ut_ad((TABLE)->type != HASH_TABLE_SYNC_MUTEX \ + || (mutex_own(hash_get_mutex((TABLE), FOLD)))); +#else /* !UNIV_HOTBACKUP */ +# define HASH_ASSERT_OWN(TABLE, FOLD) +#endif /* !UNIV_HOTBACKUP */ + +/*******************************************************************//** +Inserts a struct to a hash table. */ + +#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + HASH_ASSERT_OWN(TABLE, FOLD)\ +\ + (DATA)->NAME = NULL;\ +\ + cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\ +\ + if (cell3333->node == NULL) {\ + cell3333->node = DATA;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != NULL) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + }\ +\ + struct3333->NAME = DATA;\ + }\ +} while (0) + +#ifdef UNIV_HASH_DEBUG +# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1) +# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1 +#else +# define HASH_ASSERT_VALID(DATA) do {} while (0) +# define HASH_INVALIDATE(DATA, NAME) do {} while (0) +#endif + +/*******************************************************************//** +Deletes a struct from a hash table. */ + +#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + HASH_ASSERT_OWN(TABLE, FOLD)\ +\ + cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\ +\ + if (cell3333->node == DATA) {\ + HASH_ASSERT_VALID(DATA->NAME);\ + cell3333->node = DATA->NAME;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != DATA) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + ut_a(struct3333);\ + }\ +\ + struct3333->NAME = DATA->NAME;\ + }\ + HASH_INVALIDATE(DATA, NAME);\ +} while (0) + +/*******************************************************************//** +Gets the first struct in a hash chain, NULL if none. */ + +#define HASH_GET_FIRST(TABLE, HASH_VAL)\ + (hash_get_nth_cell(TABLE, HASH_VAL)->node) + +/*******************************************************************//** +Gets the next struct in a hash chain, NULL if none. */ + +#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME) + +/********************************************************************//** +Looks for a struct in a hash table. */ +#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\ +{\ +\ + HASH_ASSERT_OWN(TABLE, FOLD)\ +\ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\ + HASH_ASSERT_VALID(DATA);\ +\ + while ((DATA) != NULL) {\ + ASSERTION;\ + if (TEST) {\ + break;\ + } else {\ + HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\ + }\ + }\ +} + +/********************************************************************//** +Looks for an item in all hash buckets. */ +#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \ +do { \ + ulint i3333; \ + \ + for (i3333 = (TABLE)->n_cells; i3333--; ) { \ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \ + \ + while ((DATA) != NULL) { \ + HASH_ASSERT_VALID(DATA); \ + ASSERTION; \ + \ + if (TEST) { \ + break; \ + } \ + \ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \ + } \ + \ + if ((DATA) != NULL) { \ + break; \ + } \ + } \ +} while (0) + +/************************************************************//** +Gets the nth cell in a hash table. +@return pointer to cell */ +UNIV_INLINE +hash_cell_t* +hash_get_nth_cell( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint n); /*!< in: cell index */ + +/*************************************************************//** +Clears a hash table so that all the cells become empty. */ +UNIV_INLINE +void +hash_table_clear( +/*=============*/ + hash_table_t* table); /*!< in/out: hash table */ + +/*************************************************************//** +Returns the number of cells in a hash table. +@return number of cells */ +UNIV_INLINE +ulint +hash_get_n_cells( +/*=============*/ + hash_table_t* table); /*!< in: table */ +/*******************************************************************//** +Deletes a struct which is stored in the heap of the hash table, and compacts +the heap. The fold value must be stored in the struct NODE in a field named +'fold'. */ + +#define HASH_DELETE_AND_COMPACT(TYPE, NAME, TABLE, NODE)\ +do {\ + TYPE* node111;\ + TYPE* top_node111;\ + hash_cell_t* cell111;\ + ulint fold111;\ +\ + fold111 = (NODE)->fold;\ +\ + HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\ +\ + top_node111 = (TYPE*) mem_heap_get_top(\ + hash_get_heap(TABLE, fold111),\ + sizeof(TYPE));\ +\ + /* If the node to remove is not the top node in the heap, compact the\ + heap of nodes by moving the top node in the place of NODE. */\ +\ + if (NODE != top_node111) {\ +\ + /* Copy the top node in place of NODE */\ +\ + *(NODE) = *top_node111;\ +\ + cell111 = hash_get_nth_cell(TABLE,\ + hash_calc_hash(top_node111->fold, TABLE));\ +\ + /* Look for the pointer to the top node, to update it */\ +\ + if (cell111->node == top_node111) {\ + /* The top node is the first in the chain */\ +\ + cell111->node = NODE;\ + } else {\ + /* We have to look for the predecessor of the top\ + node */\ + node111 = static_cast<TYPE*>(cell111->node);\ +\ + while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\ +\ + node111 = static_cast<TYPE*>(\ + HASH_GET_NEXT(NAME, node111));\ + }\ +\ + /* Now we have the predecessor node */\ +\ + node111->NAME = NODE;\ + }\ + }\ +\ + /* Free the space occupied by the top node */\ +\ + mem_heap_free_top(hash_get_heap(TABLE, fold111), sizeof(TYPE));\ +} while (0) + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Move all hash table entries from OLD_TABLE to NEW_TABLE. */ + +#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \ +do {\ + ulint i2222;\ + ulint cell_count2222;\ +\ + cell_count2222 = hash_get_n_cells(OLD_TABLE);\ +\ + for (i2222 = 0; i2222 < cell_count2222; i2222++) {\ + NODE_TYPE* node2222 = HASH_GET_FIRST((OLD_TABLE), i2222);\ +\ + while (node2222) {\ + NODE_TYPE* next2222 = node2222->PTR_NAME;\ + ulint fold2222 = FOLD_FUNC(node2222);\ +\ + HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\ + fold2222, node2222);\ +\ + node2222 = next2222;\ + }\ + }\ +} while (0) + +/************************************************************//** +Gets the sync object index for a fold value in a hash table. +@return index */ +UNIV_INLINE +ulint +hash_get_sync_obj_index( +/*====================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Gets the nth heap in a hash table. +@return mem heap */ +UNIV_INLINE +mem_heap_t* +hash_get_nth_heap( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i); /*!< in: index of the heap */ +/************************************************************//** +Gets the heap for a fold value in a hash table. +@return mem heap */ +UNIV_INLINE +mem_heap_t* +hash_get_heap( +/*==========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Gets the nth mutex in a hash table. +@return mutex */ +UNIV_INLINE +ib_mutex_t* +hash_get_nth_mutex( +/*===============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i); /*!< in: index of the mutex */ +/************************************************************//** +Gets the nth rw_lock in a hash table. +@return rw_lock */ +UNIV_INLINE +rw_lock_t* +hash_get_nth_lock( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i); /*!< in: index of the rw_lock */ +/************************************************************//** +Gets the mutex for a fold value in a hash table. +@return mutex */ +UNIV_INLINE +ib_mutex_t* +hash_get_mutex( +/*===========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Gets the rw_lock for a fold value in a hash table. +@return rw_lock */ +UNIV_INLINE +rw_lock_t* +hash_get_lock( +/*==========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Reserves the mutex for a fold value in a hash table. */ +UNIV_INTERN +void +hash_mutex_enter( +/*=============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Releases the mutex for a fold value in a hash table. */ +UNIV_INTERN +void +hash_mutex_exit( +/*============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Reserves all the mutexes of a hash table, in an ascending order. */ +UNIV_INTERN +void +hash_mutex_enter_all( +/*=================*/ + hash_table_t* table); /*!< in: hash table */ +/************************************************************//** +Releases all the mutexes of a hash table. */ +UNIV_INTERN +void +hash_mutex_exit_all( +/*================*/ + hash_table_t* table); /*!< in: hash table */ +/************************************************************//** +Releases all but the passed in mutex of a hash table. */ +UNIV_INTERN +void +hash_mutex_exit_all_but( +/*====================*/ + hash_table_t* table, /*!< in: hash table */ + ib_mutex_t* keep_mutex); /*!< in: mutex to keep */ +/************************************************************//** +s-lock a lock for a fold value in a hash table. */ +UNIV_INTERN +void +hash_lock_s( +/*========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +x-lock a lock for a fold value in a hash table. */ +UNIV_INTERN +void +hash_lock_x( +/*========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +unlock an s-lock for a fold value in a hash table. */ +UNIV_INTERN +void +hash_unlock_s( +/*==========*/ + + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +unlock x-lock for a fold value in a hash table. */ +UNIV_INTERN +void +hash_unlock_x( +/*==========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: fold */ +/************************************************************//** +Reserves all the locks of a hash table, in an ascending order. */ +UNIV_INTERN +void +hash_lock_x_all( +/*============*/ + hash_table_t* table); /*!< in: hash table */ +/************************************************************//** +Releases all the locks of a hash table, in an ascending order. */ +UNIV_INTERN +void +hash_unlock_x_all( +/*==============*/ + hash_table_t* table); /*!< in: hash table */ +/************************************************************//** +Releases all but passed in lock of a hash table, */ +UNIV_INTERN +void +hash_unlock_x_all_but( +/*==================*/ + hash_table_t* table, /*!< in: hash table */ + rw_lock_t* keep_lock); /*!< in: lock to keep */ + +#else /* !UNIV_HOTBACKUP */ +# define hash_get_heap(table, fold) ((table)->heap) +# define hash_mutex_enter(table, fold) ((void) 0) +# define hash_mutex_exit(table, fold) ((void) 0) +# define hash_mutex_enter_all(table) ((void) 0) +# define hash_mutex_exit_all(table) ((void) 0) +# define hash_mutex_exit_all_but(t, m) ((void) 0) +# define hash_lock_s(t, f) ((void) 0) +# define hash_lock_x(t, f) ((void) 0) +# define hash_unlock_s(t, f) ((void) 0) +# define hash_unlock_x(t, f) ((void) 0) +# define hash_lock_x_all(t) ((void) 0) +# define hash_unlock_x_all(t) ((void) 0) +# define hash_unlock_x_all_but(t, l) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +struct hash_cell_t{ + void* node; /*!< hash chain node, NULL if none */ +}; + +/* The hash table structure */ +struct hash_table_t { + enum hash_table_sync_t type; /*<! type of hash_table. */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +# ifndef UNIV_HOTBACKUP + ibool adaptive;/* TRUE if this is the hash + table of the adaptive hash + index */ +# endif /* !UNIV_HOTBACKUP */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + ulint n_cells;/* number of cells in the hash table */ + hash_cell_t* array; /*!< pointer to cell array */ +#ifndef UNIV_HOTBACKUP + ulint n_sync_obj;/* if sync_objs != NULL, then + the number of either the number + of mutexes or the number of + rw_locks depending on the type. + Must be a power of 2 */ + union { + ib_mutex_t* mutexes;/* NULL, or an array of mutexes + used to protect segments of the + hash table */ + rw_lock_t* rw_locks;/* NULL, or an array of rw_lcoks + used to protect segments of the + hash table */ + } sync_obj; + + mem_heap_t** heaps; /*!< if this is non-NULL, hash + chain nodes for external chaining + can be allocated from these memory + heaps; there are then n_mutexes + many of these heaps */ +#endif /* !UNIV_HOTBACKUP */ + mem_heap_t* heap; +#ifdef UNIV_DEBUG + ulint magic_n; +# define HASH_TABLE_MAGIC_N 76561114 +#endif /* UNIV_DEBUG */ +}; + +#ifndef UNIV_NONINL +#include "hash0hash.ic" +#endif + +#endif diff --git a/storage/innobase/include/hash0hash.ic b/storage/innobase/include/hash0hash.ic new file mode 100644 index 00000000000..254f3f82e5d --- /dev/null +++ b/storage/innobase/include/hash0hash.ic @@ -0,0 +1,225 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/hash0hash.ic +The simple hash table utility + +Created 5/20/1997 Heikki Tuuri +*******************************************************/ + +#include "ut0rnd.h" + +/************************************************************//** +Gets the nth cell in a hash table. +@return pointer to cell */ +UNIV_INLINE +hash_cell_t* +hash_get_nth_cell( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint n) /*!< in: cell index */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(n < table->n_cells); + + return(table->array + n); +} + +/*************************************************************//** +Clears a hash table so that all the cells become empty. */ +UNIV_INLINE +void +hash_table_clear( +/*=============*/ + hash_table_t* table) /*!< in/out: hash table */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + memset(table->array, 0x0, + table->n_cells * sizeof(*table->array)); +} + +/*************************************************************//** +Returns the number of cells in a hash table. +@return number of cells */ +UNIV_INLINE +ulint +hash_get_n_cells( +/*=============*/ + hash_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + return(table->n_cells); +} + +/**************************************************************//** +Calculates the hash value from a folded value. +@return hashed value */ +UNIV_INLINE +ulint +hash_calc_hash( +/*===========*/ + ulint fold, /*!< in: folded value */ + hash_table_t* table) /*!< in: hash table */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + return(ut_hash_ulint(fold, table->n_cells)); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Gets the sync object index for a fold value in a hash table. +@return index */ +UNIV_INLINE +ulint +hash_get_sync_obj_index( +/*====================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(table->type != HASH_TABLE_SYNC_NONE); + ut_ad(ut_is_2pow(table->n_sync_obj)); + return(ut_2pow_remainder(hash_calc_hash(fold, table), + table->n_sync_obj)); +} + +/************************************************************//** +Gets the nth heap in a hash table. +@return mem heap */ +UNIV_INLINE +mem_heap_t* +hash_get_nth_heap( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i) /*!< in: index of the heap */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(table->type != HASH_TABLE_SYNC_NONE); + ut_ad(i < table->n_sync_obj); + + return(table->heaps[i]); +} + +/************************************************************//** +Gets the heap for a fold value in a hash table. +@return mem heap */ +UNIV_INLINE +mem_heap_t* +hash_get_heap( +/*==========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + ulint i; + + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + + if (table->heap) { + return(table->heap); + } + + i = hash_get_sync_obj_index(table, fold); + + return(hash_get_nth_heap(table, i)); +} + +/************************************************************//** +Gets the nth mutex in a hash table. +@return mutex */ +UNIV_INLINE +ib_mutex_t* +hash_get_nth_mutex( +/*===============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i) /*!< in: index of the mutex */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(table->type == HASH_TABLE_SYNC_MUTEX); + ut_ad(i < table->n_sync_obj); + + return(table->sync_obj.mutexes + i); +} + +/************************************************************//** +Gets the mutex for a fold value in a hash table. +@return mutex */ +UNIV_INLINE +ib_mutex_t* +hash_get_mutex( +/*===========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + ulint i; + + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + + i = hash_get_sync_obj_index(table, fold); + + return(hash_get_nth_mutex(table, i)); +} + +/************************************************************//** +Gets the nth rw_lock in a hash table. +@return rw_lock */ +UNIV_INLINE +rw_lock_t* +hash_get_nth_lock( +/*==============*/ + hash_table_t* table, /*!< in: hash table */ + ulint i) /*!< in: index of the rw_lock */ +{ + ut_ad(table); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + ut_ad(i < table->n_sync_obj); + + return(table->sync_obj.rw_locks + i); +} + +/************************************************************//** +Gets the rw_lock for a fold value in a hash table. +@return rw_lock */ +UNIV_INLINE +rw_lock_t* +hash_get_lock( +/*==========*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold */ +{ + ulint i; + + ut_ad(table); + ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); + ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); + + i = hash_get_sync_obj_index(table, fold); + + return(hash_get_nth_lock(table, i)); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h new file mode 100644 index 00000000000..9c3b686c998 --- /dev/null +++ b/storage/innobase/include/ibuf0ibuf.h @@ -0,0 +1,467 @@ +/***************************************************************************** + +Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ibuf0ibuf.h +Insert buffer + +Created 7/19/1997 Heikki Tuuri +*******************************************************/ + +#ifndef ibuf0ibuf_h +#define ibuf0ibuf_h + +#include "univ.i" + +#include "mtr0mtr.h" +#include "dict0mem.h" +#include "fsp0fsp.h" + +#ifndef UNIV_HOTBACKUP +# include "ibuf0types.h" + +/** Default value for maximum on-disk size of change buffer in terms +of percentage of the buffer pool. */ +#define CHANGE_BUFFER_DEFAULT_SIZE (25) + +/* Possible operations buffered in the insert/whatever buffer. See +ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */ +typedef enum { + IBUF_OP_INSERT = 0, + IBUF_OP_DELETE_MARK = 1, + IBUF_OP_DELETE = 2, + + /* Number of different operation types. */ + IBUF_OP_COUNT = 3 +} ibuf_op_t; + +/** Combinations of operations that can be buffered. Because the enum +values are used for indexing innobase_change_buffering_values[], they +should start at 0 and there should not be any gaps. */ +typedef enum { + IBUF_USE_NONE = 0, + IBUF_USE_INSERT, /* insert */ + IBUF_USE_DELETE_MARK, /* delete */ + IBUF_USE_INSERT_DELETE_MARK, /* insert+delete */ + IBUF_USE_DELETE, /* delete+purge */ + IBUF_USE_ALL, /* insert+delete+purge */ + + IBUF_USE_COUNT /* number of entries in ibuf_use_t */ +} ibuf_use_t; + +/** Operations that can currently be buffered. */ +extern ibuf_use_t ibuf_use; + +/** The insert buffer control structure */ +extern ibuf_t* ibuf; + +/* The purpose of the insert buffer is to reduce random disk access. +When we wish to insert a record into a non-unique secondary index and +the B-tree leaf page where the record belongs to is not in the buffer +pool, we insert the record into the insert buffer B-tree, indexed by +(space_id, page_no). When the page is eventually read into the buffer +pool, we look up the insert buffer B-tree for any modifications to the +page, and apply these upon the completion of the read operation. This +is called the insert buffer merge. */ + +/* The insert buffer merge must always succeed. To guarantee this, +the insert buffer subsystem keeps track of the free space in pages for +which it can buffer operations. Two bits per page in the insert +buffer bitmap indicate the available space in coarse increments. The +free bits in the insert buffer bitmap must never exceed the free space +on a page. It is safe to decrement or reset the bits in the bitmap in +a mini-transaction that is committed before the mini-transaction that +affects the free space. It is unsafe to increment the bits in a +separately committed mini-transaction, because in crash recovery, the +free bits could momentarily be set too high. */ + +/******************************************************************//** +Creates the insert buffer data structure at a database startup. */ +UNIV_INTERN +void +ibuf_init_at_db_start(void); +/*=======================*/ +/*********************************************************************//** +Updates the max_size value for ibuf. */ +UNIV_INTERN +void +ibuf_max_size_update( +/*=================*/ + ulint new_val); /*!< in: new value in terms of + percentage of the buffer pool size */ +/*********************************************************************//** +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. */ +UNIV_INTERN +void +ibuf_update_max_tablespace_id(void); +/*===============================*/ +/***************************************************************//** +Starts an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_start( +/*===========*/ + mtr_t* mtr) /*!< out: mini-transaction */ + __attribute__((nonnull)); +/***************************************************************//** +Commits an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_commit( +/*============*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/*********************************************************************//** +Initializes an ibuf bitmap page. */ +UNIV_INTERN +void +ibuf_bitmap_page_init( +/*==================*/ + buf_block_t* block, /*!< in: bitmap page */ + mtr_t* mtr); /*!< in: mtr */ +/************************************************************************//** +Resets the free bits of the page in the ibuf bitmap. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to decrement or reset the bits in the bitmap in a mini-transaction +that is committed before the mini-transaction that affects the free +space. */ +UNIV_INTERN +void +ibuf_reset_free_bits( +/*=================*/ + buf_block_t* block); /*!< in: index page; free bits are set to 0 + if the index is a non-clustered + non-unique, and page level is 0 */ +/************************************************************************//** +Updates the free bits of an uncompressed page in the ibuf bitmap if +there is not enough free on the page any more. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is +unsafe to increment the bits in a separately committed +mini-transaction, because in crash recovery, the free bits could +momentarily be set too high. It is only safe to use this function for +decrementing the free bits. Should more free space become available, +we must not update the free bits here, because that would break crash +recovery. */ +UNIV_INLINE +void +ibuf_update_free_bits_if_full( +/*==========================*/ + buf_block_t* block, /*!< in: index page to which we have added new + records; the free bits are updated if the + index is non-clustered and non-unique and + the page level is 0, and the page becomes + fuller */ + ulint max_ins_size,/*!< in: value of maximum insert size with + reorganize before the latest operation + performed to the page */ + ulint increase);/*!< in: upper limit for the additional space + used in the latest operation, if known, or + ULINT_UNDEFINED */ +/**********************************************************************//** +Updates the free bits for an uncompressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +UNIV_INTERN +void +ibuf_update_free_bits_low( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + ulint max_ins_size, /*!< in: value of + maximum insert size + with reorganize before + the latest operation + performed to the page */ + mtr_t* mtr); /*!< in/out: mtr */ +/**********************************************************************//** +Updates the free bits for a compressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +UNIV_INTERN +void +ibuf_update_free_bits_zip( +/*======================*/ + buf_block_t* block, /*!< in/out: index page */ + mtr_t* mtr); /*!< in/out: mtr */ +/**********************************************************************//** +Updates the free bits for the two pages to reflect the present state. +Does this in the mtr given, which means that the latching order rules +virtually prevent any further operations until mtr is committed. +NOTE: The free bits in the insert buffer bitmap must never exceed the +free space on a page. It is safe to set the free bits in the same +mini-transaction that updated the pages. */ +UNIV_INTERN +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + buf_block_t* block1, /*!< in: index page */ + buf_block_t* block2, /*!< in: index page */ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +A basic partial test if an insert to the insert buffer could be possible and +recommended. */ +UNIV_INLINE +ibool +ibuf_should_try( +/*============*/ + dict_index_t* index, /*!< in: index where to insert */ + ulint ignore_sec_unique); /*!< in: if != 0, we should + ignore UNIQUE constraint on + a secondary index when we + decide */ +/******************************************************************//** +Returns TRUE if the current OS thread is performing an insert buffer +routine. + +For instance, a read-ahead of non-ibuf pages is forbidden by threads +that are executing an insert buffer routine. +@return TRUE if inside an insert buffer routine */ +UNIV_INLINE +ibool +ibuf_inside( +/*========*/ + const mtr_t* mtr) /*!< in: mini-transaction */ + __attribute__((nonnull, pure)); +/***********************************************************************//** +Checks if a page address is an ibuf bitmap page (level 3 page) address. +@return TRUE if a bitmap page */ +UNIV_INLINE +ibool +ibuf_bitmap_page( +/*=============*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no);/*!< in: page number */ +/***********************************************************************//** +Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==TRUE. +@return TRUE if level 2 or level 3 page */ +UNIV_INTERN +ibool +ibuf_page_low( +/*==========*/ + ulint space, /*!< in: space id */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + ulint page_no,/*!< in: page number */ +#ifdef UNIV_DEBUG + ibool x_latch,/*!< in: FALSE if relaxed check + (avoid latching the bitmap page) */ +#endif /* UNIV_DEBUG */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line where called */ + mtr_t* mtr) /*!< in: mtr which will contain an + x-latch to the bitmap page if the page + is not one of the fixed address ibuf + pages, or NULL, in which case a new + transaction is created. */ + __attribute__((warn_unused_result)); +#ifdef UNIV_DEBUG +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of +pages. Must not be called when recv_no_ibuf_operations==TRUE. +@param space tablespace identifier +@param zip_size compressed page size in bytes, or 0 +@param page_no page number +@param mtr mini-transaction or NULL +@return TRUE if level 2 or level 3 page */ +# define ibuf_page(space, zip_size, page_no, mtr) \ + ibuf_page_low(space, zip_size, page_no, TRUE, __FILE__, __LINE__, mtr) +#else /* UVIV_DEBUG */ +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of +pages. Must not be called when recv_no_ibuf_operations==TRUE. +@param space tablespace identifier +@param zip_size compressed page size in bytes, or 0 +@param page_no page number +@param mtr mini-transaction or NULL +@return TRUE if level 2 or level 3 page */ +# define ibuf_page(space, zip_size, page_no, mtr) \ + ibuf_page_low(space, zip_size, page_no, __FILE__, __LINE__, mtr) +#endif /* UVIV_DEBUG */ +/***********************************************************************//** +Frees excess pages from the ibuf free list. This function is called when an OS +thread calls fsp services to allocate a new file segment, or a new page to a +file segment, and the thread did not own the fsp latch before this call. */ +UNIV_INTERN +void +ibuf_free_excess_pages(void); +/*========================*/ +/*********************************************************************//** +Buffer an operation in the insert/delete buffer, instead of doing it +directly to the disk page, if this is possible. Does not do it if the index +is clustered or unique. +@return TRUE if success */ +UNIV_INTERN +ibool +ibuf_insert( +/*========*/ + ibuf_op_t op, /*!< in: operation type */ + const dtuple_t* entry, /*!< in: index entry to insert */ + dict_index_t* index, /*!< in: index where to insert */ + ulint space, /*!< in: space id where to insert */ + ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ + ulint page_no,/*!< in: page number where to insert */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +When an index page is read from a disk to the buffer pool, this function +applies any buffered operations to the page and deletes the entries from the +insert buffer. If the page is not read, but created in the buffer pool, this +function deletes its buffered entries from the insert buffer; there can +exist entries for such a page if the page belonged to an index which +subsequently was dropped. */ +UNIV_INTERN +void +ibuf_merge_or_delete_for_page( +/*==========================*/ + buf_block_t* block, /*!< in: if page has been read from + disk, pointer to the page x-latched, + else NULL */ + ulint space, /*!< in: space id of the index page */ + ulint page_no,/*!< in: page number of the index page */ + ulint zip_size,/*!< in: compressed page size in bytes, + or 0 */ + ibool update_ibuf_bitmap);/*!< in: normally this is set + to TRUE, but if we have deleted or are + deleting the tablespace, then we + naturally do not want to update a + non-existent bitmap page */ +/*********************************************************************//** +Deletes all entries in the insert buffer for a given space id. This is used +in DISCARD TABLESPACE and IMPORT TABLESPACE. +NOTE: this does not update the page free bitmaps in the space. The space will +become CORRUPT when you call this function! */ +UNIV_INTERN +void +ibuf_delete_for_discarded_space( +/*============================*/ + ulint space); /*!< in: space id */ +/*********************************************************************//** +Contracts insert buffer trees by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read, 0 if ibuf is +empty */ +UNIV_INTERN +ulint +ibuf_contract_in_background( +/*========================*/ + table_id_t table_id, /*!< in: if merge should be done only + for a specific table, for all tables + this should be 0 */ + ibool full); /*!< in: TRUE if the caller wants to + do a full contract based on PCT_IO(100). + If FALSE then the size of contract + batch is determined based on the + current size of the ibuf tree. */ +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Parses a redo log record of an ibuf bitmap page init. +@return end of log record or NULL */ +UNIV_INTERN +byte* +ibuf_parse_bitmap_init( +/*===================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: block or NULL */ + mtr_t* mtr); /*!< in: mtr or NULL */ +#ifndef UNIV_HOTBACKUP +#ifdef UNIV_IBUF_COUNT_DEBUG +/******************************************************************//** +Gets the ibuf count for a given page. +@return number of entries in the insert buffer currently buffered for +this page */ +UNIV_INTERN +ulint +ibuf_count_get( +/*===========*/ + ulint space, /*!< in: space id */ + ulint page_no);/*!< in: page number */ +#endif +/******************************************************************//** +Looks if the insert buffer is empty. +@return true if empty */ +UNIV_INTERN +bool +ibuf_is_empty(void); +/*===============*/ +/******************************************************************//** +Prints info of ibuf. */ +UNIV_INTERN +void +ibuf_print( +/*=======*/ + FILE* file); /*!< in: file where to print */ +/******************************************************************** +Read the first two bytes from a record's fourth field (counter field in new +records; something else in older records). +@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */ +UNIV_INTERN +ulint +ibuf_rec_get_counter( +/*=================*/ + const rec_t* rec); /*!< in: ibuf record */ +/******************************************************************//** +Closes insert buffer and frees the data structures. */ +UNIV_INTERN +void +ibuf_close(void); +/*============*/ + +/******************************************************************//** +Checks the insert buffer bitmaps on IMPORT TABLESPACE. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +ibuf_check_bitmap_on_import( +/*========================*/ + const trx_t* trx, /*!< in: transaction */ + ulint space_id) /*!< in: tablespace identifier */ + __attribute__((nonnull, warn_unused_result)); + +#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO +#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO + +#endif /* !UNIV_HOTBACKUP */ + +/* The ibuf header page currently contains only the file segment header +for the file segment from which the pages for the ibuf tree are allocated */ +#define IBUF_HEADER PAGE_DATA +#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */ + +/* The insert buffer tree itself is always located in space 0. */ +#define IBUF_SPACE_ID 0 + +#ifndef UNIV_NONINL +#include "ibuf0ibuf.ic" +#endif + +#endif diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic new file mode 100644 index 00000000000..21747fdceac --- /dev/null +++ b/storage/innobase/include/ibuf0ibuf.ic @@ -0,0 +1,367 @@ +/***************************************************************************** + +Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ibuf0ibuf.ic +Insert buffer + +Created 7/19/1997 Heikki Tuuri +*******************************************************/ + +#include "page0page.h" +#include "page0zip.h" +#ifndef UNIV_HOTBACKUP +#include "buf0lru.h" + +/** An index page must contain at least UNIV_PAGE_SIZE / +IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to +buffer inserts to this page. If there is this much of free space, the +corresponding bits are set in the ibuf bitmap. */ +#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32 + +/***************************************************************//** +Starts an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_start( +/*===========*/ + mtr_t* mtr) /*!< out: mini-transaction */ +{ + mtr_start(mtr); + mtr->inside_ibuf = TRUE; +} +/***************************************************************//** +Commits an insert buffer mini-transaction. */ +UNIV_INLINE +void +ibuf_mtr_commit( +/*============*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->inside_ibuf); + ut_d(mtr->inside_ibuf = FALSE); + mtr_commit(mtr); +} + +/** Insert buffer struct */ +struct ibuf_t{ + ulint size; /*!< current size of the ibuf index + tree, in pages */ + ulint max_size; /*!< recommended maximum size of the + ibuf index tree, in pages */ + ulint seg_size; /*!< allocated pages of the file + segment containing ibuf header and + tree */ + bool empty; /*!< Protected by the page + latch of the root page of the + insert buffer tree + (FSP_IBUF_TREE_ROOT_PAGE_NO). true + if and only if the insert + buffer tree is empty. */ + ulint free_list_len; /*!< length of the free list */ + ulint height; /*!< tree height */ + dict_index_t* index; /*!< insert buffer index */ + + ulint n_merges; /*!< number of pages merged */ + ulint n_merged_ops[IBUF_OP_COUNT]; + /*!< number of operations of each type + merged to index pages */ + ulint n_discarded_ops[IBUF_OP_COUNT]; + /*!< number of operations of each type + discarded without merging due to the + tablespace being deleted or the + index being dropped */ +}; + +/************************************************************************//** +Sets the free bit of the page in the ibuf bitmap. This is done in a separate +mini-transaction, hence this operation does not restrict further work to only +ibuf bitmap operations, which would result if the latch to the bitmap page +were kept. */ +UNIV_INTERN +void +ibuf_set_free_bits_func( +/*====================*/ + buf_block_t* block, /*!< in: index page of a non-clustered index; + free bit is reset if page level is 0 */ +#ifdef UNIV_IBUF_DEBUG + ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum + value which the bits must have before + setting; this is for debugging */ +#endif /* UNIV_IBUF_DEBUG */ + ulint val); /*!< in: value to set: < 4 */ +#ifdef UNIV_IBUF_DEBUG +# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v) +#else /* UNIV_IBUF_DEBUG */ +# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v) +#endif /* UNIV_IBUF_DEBUG */ + +/**********************************************************************//** +A basic partial test if an insert to the insert buffer could be possible and +recommended. */ +UNIV_INLINE +ibool +ibuf_should_try( +/*============*/ + dict_index_t* index, /*!< in: index where to insert */ + ulint ignore_sec_unique) /*!< in: if != 0, we should + ignore UNIQUE constraint on + a secondary index when we + decide */ +{ + return(ibuf_use != IBUF_USE_NONE + && ibuf->max_size != 0 + && !dict_index_is_clust(index) + && index->table->quiesce == QUIESCE_NONE + && (ignore_sec_unique || !dict_index_is_unique(index))); +} + +/******************************************************************//** +Returns TRUE if the current OS thread is performing an insert buffer +routine. + +For instance, a read-ahead of non-ibuf pages is forbidden by threads +that are executing an insert buffer routine. +@return TRUE if inside an insert buffer routine */ +UNIV_INLINE +ibool +ibuf_inside( +/*========*/ + const mtr_t* mtr) /*!< in: mini-transaction */ +{ + return(mtr->inside_ibuf); +} + +/***********************************************************************//** +Checks if a page address is an ibuf bitmap page address. +@return TRUE if a bitmap page */ +UNIV_INLINE +ibool +ibuf_bitmap_page( +/*=============*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no)/*!< in: page number */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (!zip_size) { + return((page_no & (UNIV_PAGE_SIZE - 1)) + == FSP_IBUF_BITMAP_OFFSET); + } + + return((page_no & (zip_size - 1)) == FSP_IBUF_BITMAP_OFFSET); +} + +/*********************************************************************//** +Translates the free space on a page to a value in the ibuf bitmap. +@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_bits( +/*===========================*/ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint max_ins_size) /*!< in: maximum insert size after reorganize + for the page */ +{ + ulint n; + ut_ad(ut_is_2pow(zip_size)); + ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + + if (zip_size) { + n = max_ins_size + / (zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } else { + n = max_ins_size + / (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + if (n == 3) { + n = 2; + } + + if (n > 3) { + n = 3; + } + + return(n); +} + +/*********************************************************************//** +Translates the ibuf free bits to the free space on a page in bytes. +@return maximum insert size after reorganize for the page */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_from_bits( +/*================================*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint bits) /*!< in: value for ibuf bitmap bits */ +{ + ut_ad(bits < 4); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX); + + if (zip_size) { + if (bits == 3) { + return(4 * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + return(bits * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + if (bits == 3) { + return(4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + return(bits * (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE)); +} + +/*********************************************************************//** +Translates the free space on a compressed page to a value in the ibuf bitmap. +@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_zip( +/*==========================*/ + ulint zip_size, + /*!< in: compressed page size in bytes */ + const buf_block_t* block) /*!< in: buffer block */ +{ + ulint max_ins_size; + const page_zip_des_t* page_zip; + lint zip_max_ins; + + ut_ad(zip_size == buf_block_get_zip_size(block)); + ut_ad(zip_size); + + /* Consider the maximum insert size on the uncompressed page + without reorganizing the page. We must not assume anything + about the compression ratio. If zip_max_ins > max_ins_size and + there is 1/4 garbage on the page, recompression after the + reorganize could fail, in theory. So, let us guarantee that + merging a buffered insert to a compressed page will always + succeed without reorganizing or recompressing the page, just + by using the page modification log. */ + max_ins_size = page_get_max_insert_size( + buf_block_get_frame(block), 1); + + page_zip = buf_block_get_page_zip(block); + zip_max_ins = page_zip_max_ins_size(page_zip, + FALSE/* not clustered */); + + if (zip_max_ins < 0) { + return(0); + } else if (max_ins_size > (ulint) zip_max_ins) { + max_ins_size = (ulint) zip_max_ins; + } + + return(ibuf_index_page_calc_free_bits(zip_size, max_ins_size)); +} + +/*********************************************************************//** +Translates the free space on a page to a value in the ibuf bitmap. +@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free( +/*======================*/ + ulint zip_size,/*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + const buf_block_t* block) /*!< in: buffer block */ +{ + ut_ad(zip_size == buf_block_get_zip_size(block)); + + if (!zip_size) { + ulint max_ins_size; + + max_ins_size = page_get_max_insert_size_after_reorganize( + buf_block_get_frame(block), 1); + + return(ibuf_index_page_calc_free_bits(0, max_ins_size)); + } else { + return(ibuf_index_page_calc_free_zip(zip_size, block)); + } +} + +/************************************************************************//** +Updates the free bits of an uncompressed page in the ibuf bitmap if +there is not enough free on the page any more. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is +unsafe to increment the bits in a separately committed +mini-transaction, because in crash recovery, the free bits could +momentarily be set too high. It is only safe to use this function for +decrementing the free bits. Should more free space become available, +we must not update the free bits here, because that would break crash +recovery. */ +UNIV_INLINE +void +ibuf_update_free_bits_if_full( +/*==========================*/ + buf_block_t* block, /*!< in: index page to which we have added new + records; the free bits are updated if the + index is non-clustered and non-unique and + the page level is 0, and the page becomes + fuller */ + ulint max_ins_size,/*!< in: value of maximum insert size with + reorganize before the latest operation + performed to the page */ + ulint increase)/*!< in: upper limit for the additional space + used in the latest operation, if known, or + ULINT_UNDEFINED */ +{ + ulint before; + ulint after; + + ut_ad(!buf_block_get_page_zip(block)); + + before = ibuf_index_page_calc_free_bits(0, max_ins_size); + + if (max_ins_size >= increase) { +#if ULINT32_UNDEFINED <= UNIV_PAGE_SIZE_MAX +# error "ULINT32_UNDEFINED <= UNIV_PAGE_SIZE_MAX" +#endif + after = ibuf_index_page_calc_free_bits(0, max_ins_size + - increase); +#ifdef UNIV_IBUF_DEBUG + ut_a(after <= ibuf_index_page_calc_free(0, block)); +#endif + } else { + after = ibuf_index_page_calc_free(0, block); + } + + if (after == 0) { + /* We move the page to the front of the buffer pool LRU list: + the purpose of this is to prevent those pages to which we + cannot make inserts using the insert buffer from slipping + out of the buffer pool */ + + buf_page_make_young(&block->page); + } + + if (before > after) { + ibuf_set_free_bits(block, after, before); + } +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/ibuf0types.h b/storage/innobase/include/ibuf0types.h new file mode 100644 index 00000000000..3fdbf078b0b --- /dev/null +++ b/storage/innobase/include/ibuf0types.h @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ibuf0types.h +Insert buffer global types + +Created 7/29/1997 Heikki Tuuri +*******************************************************/ + +#ifndef ibuf0types_h +#define ibuf0types_h + +struct ibuf_t; + +#endif diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h new file mode 100644 index 00000000000..0054850b526 --- /dev/null +++ b/storage/innobase/include/lock0iter.h @@ -0,0 +1,69 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0iter.h +Lock queue iterator type and function prototypes. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +#ifndef lock0iter_h +#define lock0iter_h + +#include "univ.i" +#include "lock0types.h" + +struct lock_queue_iterator_t { + const lock_t* current_lock; + /* In case this is a record lock queue (not table lock queue) + then bit_no is the record number within the heap in which the + record is stored. */ + ulint bit_no; +}; + +/*******************************************************************//** +Initialize lock queue iterator so that it starts to iterate from +"lock". bit_no specifies the record number within the heap where the +record is stored. It can be undefined (ULINT_UNDEFINED) in two cases: +1. If the lock is a table lock, thus we have a table lock queue; +2. If the lock is a record lock and it is a wait lock. In this case + bit_no is calculated in this function by using + lock_rec_find_set_bit(). There is exactly one bit set in the bitmap + of a wait lock. */ +UNIV_INTERN +void +lock_queue_iterator_reset( +/*======================*/ + lock_queue_iterator_t* iter, /*!< out: iterator */ + const lock_t* lock, /*!< in: lock to start from */ + ulint bit_no);/*!< in: record number in the + heap */ + +/*******************************************************************//** +Gets the previous lock in the lock queue, returns NULL if there are no +more locks (i.e. the current lock is the first one). The iterator is +receded (if not-NULL is returned). +@return previous lock or NULL */ + +const lock_t* +lock_queue_iterator_get_prev( +/*=========================*/ + lock_queue_iterator_t* iter); /*!< in/out: iterator */ + +#endif /* lock0iter_h */ diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h new file mode 100644 index 00000000000..6d5ed35d5d8 --- /dev/null +++ b/storage/innobase/include/lock0lock.h @@ -0,0 +1,979 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0lock.h +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#ifndef lock0lock_h +#define lock0lock_h + +#include "univ.i" +#include "buf0types.h" +#include "trx0types.h" +#include "mtr0types.h" +#include "rem0types.h" +#include "dict0types.h" +#include "que0types.h" +#include "lock0types.h" +#include "read0types.h" +#include "hash0hash.h" +#include "srv0srv.h" +#include "ut0vec.h" + +#ifdef UNIV_DEBUG +extern ibool lock_print_waits; +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Gets the size of a lock struct. +@return size in bytes */ +UNIV_INTERN +ulint +lock_get_size(void); +/*===============*/ +/*********************************************************************//** +Creates the lock system at database start. */ +UNIV_INTERN +void +lock_sys_create( +/*============*/ + ulint n_cells); /*!< in: number of slots in lock hash table */ +/*********************************************************************//** +Closes the lock system at database shutdown. */ +UNIV_INTERN +void +lock_sys_close(void); +/*================*/ +/*********************************************************************//** +Gets the heap_no of the smallest user record on a page. +@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */ +UNIV_INLINE +ulint +lock_get_min_heap_no( +/*=================*/ + const buf_block_t* block); /*!< in: buffer block */ +/*************************************************************//** +Updates the lock table when we have reorganized a page. NOTE: we copy +also the locks set on the infimum of the page; the infimum may carry +locks if an update of a record is occurring on the page, and its locks +were temporarily stored on the infimum. */ +UNIV_INTERN +void +lock_move_reorganize_page( +/*======================*/ + const buf_block_t* block, /*!< in: old index page, now + reorganized */ + const buf_block_t* oblock);/*!< in: copy of the old, not + reorganized page */ +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list end is moved to another page. */ +UNIV_INTERN +void +lock_move_rec_list_end( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec); /*!< in: record on page: this + is the first record moved */ +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. */ +UNIV_INTERN +void +lock_move_rec_list_start( +/*=====================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec, /*!< in: record on page: + this is the first + record NOT copied */ + const rec_t* old_end); /*!< in: old + previous-to-last + record on new_page + before the records + were copied */ +/*************************************************************//** +Updates the lock table when a page is split to the right. */ +UNIV_INTERN +void +lock_update_split_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block); /*!< in: left page */ +/*************************************************************//** +Updates the lock table when a page is merged to the right. */ +UNIV_INTERN +void +lock_update_merge_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page to + which merged */ + const rec_t* orig_succ, /*!< in: original + successor of infimum + on the right page + before merge */ + const buf_block_t* left_block); /*!< in: merged index + page which will be + discarded */ +/*************************************************************//** +Updates the lock table when the root page is copied to another in +btr_root_raise_and_insert. Note that we leave lock structs on the +root page, even though they do not make sense on other than leaf +pages: the reason is that in a pessimistic update the infimum record +of the root page will act as a dummy carrier of the locks of the record +to be updated. */ +UNIV_INTERN +void +lock_update_root_raise( +/*===================*/ + const buf_block_t* block, /*!< in: index page to which copied */ + const buf_block_t* root); /*!< in: root page */ +/*************************************************************//** +Updates the lock table when a page is copied to another and the original page +is removed from the chain of leaf pages, except if page is the root! */ +UNIV_INTERN +void +lock_update_copy_and_discard( +/*=========================*/ + const buf_block_t* new_block, /*!< in: index page to + which copied */ + const buf_block_t* block); /*!< in: index page; + NOT the root! */ +/*************************************************************//** +Updates the lock table when a page is split to the left. */ +UNIV_INTERN +void +lock_update_split_left( +/*===================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block); /*!< in: left page */ +/*************************************************************//** +Updates the lock table when a page is merged to the left. */ +UNIV_INTERN +void +lock_update_merge_left( +/*===================*/ + const buf_block_t* left_block, /*!< in: left page to + which merged */ + const rec_t* orig_pred, /*!< in: original predecessor + of supremum on the left page + before merge */ + const buf_block_t* right_block); /*!< in: merged index page + which will be discarded */ +/*************************************************************//** +Resets the original locks on heir and replaces them with gap type locks +inherited from rec. */ +UNIV_INTERN +void +lock_rec_reset_and_inherit_gap_locks( +/*=================================*/ + const buf_block_t* heir_block, /*!< in: block containing the + record which inherits */ + const buf_block_t* block, /*!< in: block containing the + record from which inherited; + does NOT reset the locks on + this record */ + ulint heir_heap_no, /*!< in: heap_no of the + inheriting record */ + ulint heap_no); /*!< in: heap_no of the + donating record */ +/*************************************************************//** +Updates the lock table when a page is discarded. */ +UNIV_INTERN +void +lock_update_discard( +/*================*/ + const buf_block_t* heir_block, /*!< in: index page + which will inherit the locks */ + ulint heir_heap_no, /*!< in: heap_no of the record + which will inherit the locks */ + const buf_block_t* block); /*!< in: index page + which will be discarded */ +/*************************************************************//** +Updates the lock table when a new user record is inserted. */ +UNIV_INTERN +void +lock_update_insert( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: the inserted record */ +/*************************************************************//** +Updates the lock table when a record is removed. */ +UNIV_INTERN +void +lock_update_delete( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: the record to be removed */ +/*********************************************************************//** +Stores on the page infimum record the explicit locks of another record. +This function is used to store the lock state of a record when it is +updated and the size of the record changes in the update. The record +is in such an update moved, perhaps to another page. The infimum record +acts as a dummy carrier record, taking care of lock releases while the +actual record is being moved. */ +UNIV_INTERN +void +lock_rec_store_on_page_infimum( +/*===========================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: record whose lock state + is stored on the infimum + record of the same page; lock + bits are reset on the + record */ +/*********************************************************************//** +Restores the state of explicit lock requests on a single record, where the +state was stored on the infimum of the page. */ +UNIV_INTERN +void +lock_rec_restore_from_page_infimum( +/*===============================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec, /*!< in: record whose lock state + is restored */ + const buf_block_t* donator);/*!< in: page (rec is not + necessarily on this page) + whose infimum stored the lock + state; lock bits are reset on + the infimum */ +/*********************************************************************//** +Determines if there are explicit record locks on a page. +@return an explicit record lock on the page, or NULL if there are none */ +UNIV_INTERN +lock_t* +lock_rec_expl_exist_on_page( +/*========================*/ + ulint space, /*!< in: space id */ + ulint page_no)/*!< in: page number */ + __attribute__((warn_unused_result)); +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate insert of +a record. If they do, first tests if the query thread should anyway +be suspended for some reason; if not, then puts the transaction and +the query thread to the lock wait state and inserts a waiting request +for a gap x-lock to the lock queue. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_rec_insert_check_and_lock( +/*===========================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is + set, does nothing */ + const rec_t* rec, /*!< in: record after which to insert */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + dict_index_t* index, /*!< in: index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + ibool* inherit)/*!< out: set to TRUE if the new + inserted record maybe should inherit + LOCK_GAP type locks from the successor + record */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate modify (update, +delete mark, or delete unmark) of a clustered index record. If they do, +first tests if the query thread should anyway be suspended for some +reason; if not, then puts the transaction and the query thread to the +lock wait state and inserts a waiting request for a record x-lock to the +lock queue. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_clust_rec_modify_check_and_lock( +/*=================================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record which should be + modified */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((warn_unused_result, nonnull)); +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate modify +(delete mark or delete unmark) of a secondary index record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_sec_rec_modify_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + const rec_t* rec, /*!< in: record which should be + modified; NOTE: as this is a secondary + index, we always have to modify the + clustered index record first: see the + comment below */ + dict_index_t* index, /*!< in: secondary index */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((warn_unused_result, nonnull(2,3,4,6))); +/*********************************************************************//** +Like lock_clust_rec_read_check_and_lock(), but reads a +secondary index record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, +or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_sec_rec_read_check_and_lock( +/*=============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: secondary index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + enum lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK, +or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_clust_rec_read_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + enum lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + enum lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Checks that a record is seen in a consistent read. +@return true if sees, or false if an earlier version of the record +should be retrieved */ +UNIV_INTERN +bool +lock_clust_rec_cons_read_sees( +/*==========================*/ + const rec_t* rec, /*!< in: user record which should be read or + passed over by a read cursor */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + read_view_t* view); /*!< in: consistent read view */ +/*********************************************************************//** +Checks that a non-clustered index record is seen in a consistent read. + +NOTE that a non-clustered index page contains so little information on +its modifications that also in the case false, the present version of +rec may be the right, but we must check this from the clustered index +record. + +@return true if certainly sees, or false if an earlier version of the +clustered index record might be needed */ +UNIV_INTERN +bool +lock_sec_rec_cons_read_sees( +/*========================*/ + const rec_t* rec, /*!< in: user record which + should be read or passed over + by a read cursor */ + const read_view_t* view) /*!< in: consistent read view */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Locks the specified database table in the mode given. If the lock cannot +be granted immediately, the query thread is put to wait. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ +UNIV_INTERN +dberr_t +lock_table( +/*=======*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set, + does nothing */ + dict_table_t* table, /*!< in/out: database table + in dictionary cache */ + enum lock_mode mode, /*!< in: lock mode */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Creates a table IX lock object for a resurrected transaction. */ +UNIV_INTERN +void +lock_table_ix_resurrect( +/*====================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx); /*!< in/out: transaction */ +/*************************************************************//** +Removes a granted record lock of a transaction from the queue and grants +locks to other transactions waiting in the queue if they now are entitled +to a lock. */ +UNIV_INTERN +void +lock_rec_unlock( +/*============*/ + trx_t* trx, /*!< in/out: transaction that has + set a record lock */ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec, /*!< in: record */ + enum lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */ +/*********************************************************************//** +Releases a transaction's locks, and releases possible other transactions +waiting because of these locks. Change the state of the transaction to +TRX_STATE_COMMITTED_IN_MEMORY. */ +UNIV_INTERN +void +lock_trx_release_locks( +/*===================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Removes locks on a table to be dropped or truncated. +If remove_also_table_sx_locks is TRUE then table-level S and X locks are +also removed in addition to other table-level and record-level locks. +No lock, that is going to be removed, is allowed to be a wait lock. */ +UNIV_INTERN +void +lock_remove_all_on_table( +/*=====================*/ + dict_table_t* table, /*!< in: table to be dropped + or truncated */ + ibool remove_also_table_sx_locks);/*!< in: also removes + table S and X locks */ + +/*********************************************************************//** +Calculates the fold value of a page file address: used in inserting or +searching for a lock in the hash table. +@return folded value */ +UNIV_INLINE +ulint +lock_rec_fold( +/*==========*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ + __attribute__((const)); +/*********************************************************************//** +Calculates the hash value of a page file address: used in inserting or +searching for a lock in the hash table. +@return hashed value */ +UNIV_INLINE +ulint +lock_rec_hash( +/*==========*/ + ulint space, /*!< in: space */ + ulint page_no);/*!< in: page number */ + +/**********************************************************************//** +Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED, +if none found. +@return bit index == heap number of the record, or ULINT_UNDEFINED if +none found */ +UNIV_INTERN +ulint +lock_rec_find_set_bit( +/*==================*/ + const lock_t* lock); /*!< in: record lock with at least one + bit set */ + +/*********************************************************************//** +Gets the source table of an ALTER TABLE transaction. The table must be +covered by an IX or IS table lock. +@return the source table of transaction, if it is covered by an IX or +IS table lock; dest if there is no source table, and NULL if the +transaction is locking more than two tables or an inconsistency is +found */ +UNIV_INTERN +dict_table_t* +lock_get_src_table( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* dest, /*!< in: destination of ALTER TABLE */ + enum lock_mode* mode); /*!< out: lock mode of the source table */ +/*********************************************************************//** +Determine if the given table is exclusively "owned" by the given +transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC +on the table. +@return TRUE if table is only locked by trx, with LOCK_IX, and +possibly LOCK_AUTO_INC */ +UNIV_INTERN +ibool +lock_is_table_exclusive( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull)); +/*********************************************************************//** +Checks if a lock request lock1 has to wait for request lock2. +@return TRUE if lock1 has to wait for lock2 to be removed */ +UNIV_INTERN +ibool +lock_has_to_wait( +/*=============*/ + const lock_t* lock1, /*!< in: waiting lock */ + const lock_t* lock2); /*!< in: another lock; NOTE that it is + assumed that this has a lock bit set + on the same record as in lock1 if the + locks are record locks */ +/*********************************************************************//** +Reports that a transaction id is insensible, i.e., in the future. */ +UNIV_INTERN +void +lock_report_trx_id_insanity( +/*========================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */ + trx_id_t max_trx_id) /*!< in: trx_sys_get_max_trx_id() */ + __attribute__((nonnull)); +/*********************************************************************//** +Prints info of a table lock. */ +UNIV_INTERN +void +lock_table_print( +/*=============*/ + FILE* file, /*!< in: file where to print */ + const lock_t* lock); /*!< in: table type lock */ +/*********************************************************************//** +Prints info of a record lock. */ +UNIV_INTERN +void +lock_rec_print( +/*===========*/ + FILE* file, /*!< in: file where to print */ + const lock_t* lock); /*!< in: record type lock */ +/*********************************************************************//** +Prints info of locks for all transactions. +@return FALSE if not able to obtain lock mutex and exits without +printing info */ +UNIV_INTERN +ibool +lock_print_info_summary( +/*====================*/ + FILE* file, /*!< in: file where to print */ + ibool nowait) /*!< in: whether to wait for the lock mutex */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Prints info of locks for each transaction. This function assumes that the +caller holds the lock mutex and more importantly it will release the lock +mutex on behalf of the caller. (This should be fixed in the future). */ +UNIV_INTERN +void +lock_print_info_all_transactions( +/*=============================*/ + FILE* file); /*!< in: file where to print */ +/*********************************************************************//** +Return approximate number or record locks (bits set in the bitmap) for +this transaction. Since delete-marked records may be removed, the +record count will not be precise. +The caller must be holding lock_sys->mutex. */ +UNIV_INTERN +ulint +lock_number_of_rows_locked( +/*=======================*/ + const trx_lock_t* trx_lock) /*!< in: transaction locks */ + __attribute__((nonnull, warn_unused_result)); + +/*******************************************************************//** +Gets the type of a lock. Non-inline version for using outside of the +lock module. +@return LOCK_TABLE or LOCK_REC */ +UNIV_INTERN +ulint +lock_get_type( +/*==========*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Gets the id of the transaction owning a lock. +@return transaction id */ +UNIV_INTERN +trx_id_t +lock_get_trx_id( +/*============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Gets the mode of a lock in a human readable string. +The string should not be free()'d or modified. +@return lock mode */ +UNIV_INTERN +const char* +lock_get_mode_str( +/*==============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Gets the type of a lock in a human readable string. +The string should not be free()'d or modified. +@return lock type */ +UNIV_INTERN +const char* +lock_get_type_str( +/*==============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Gets the id of the table on which the lock is. +@return id of the table */ +UNIV_INTERN +table_id_t +lock_get_table_id( +/*==============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +Gets the name of the table on which the lock is. +The string should not be free()'d or modified. +@return name of the table */ +UNIV_INTERN +const char* +lock_get_table_name( +/*================*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +For a record lock, gets the index on which the lock is. +@return index */ +UNIV_INTERN +const dict_index_t* +lock_rec_get_index( +/*===============*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +For a record lock, gets the name of the index on which the lock is. +The string should not be free()'d or modified. +@return name of the index */ +UNIV_INTERN +const char* +lock_rec_get_index_name( +/*====================*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +For a record lock, gets the tablespace number on which the lock is. +@return tablespace number */ +UNIV_INTERN +ulint +lock_rec_get_space_id( +/*==================*/ + const lock_t* lock); /*!< in: lock */ + +/*******************************************************************//** +For a record lock, gets the page number on which the lock is. +@return page number */ +UNIV_INTERN +ulint +lock_rec_get_page_no( +/*=================*/ + const lock_t* lock); /*!< in: lock */ +/*******************************************************************//** +Check if there are any locks (table or rec) against table. +@return TRUE if locks exist */ +UNIV_INTERN +ibool +lock_table_has_locks( +/*=================*/ + const dict_table_t* table); /*!< in: check if there are any locks + held on records in this table or on the + table itself */ + +/*********************************************************************//** +A thread which wakes up threads whose lock wait may have lasted too long. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(lock_wait_timeout_thread)( +/*=====================================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ + +/********************************************************************//** +Releases a user OS thread waiting for a lock to be released, if the +thread is already suspended. */ +UNIV_INTERN +void +lock_wait_release_thread_if_suspended( +/*==================================*/ + que_thr_t* thr); /*!< in: query thread associated with the + user OS thread */ + +/***************************************************************//** +Puts a user OS thread to wait for a lock to be released. If an error +occurs during the wait trx->error_state associated with thr is +!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK +are possible errors. DB_DEADLOCK is returned if selective deadlock +resolution chose this transaction as a victim. */ +UNIV_INTERN +void +lock_wait_suspend_thread( +/*=====================*/ + que_thr_t* thr); /*!< in: query thread associated with the + user OS thread */ +/*********************************************************************//** +Unlocks AUTO_INC type locks that were possibly reserved by a trx. This +function should be called at the the end of an SQL statement, by the +connection thread that owns the transaction (trx->mysql_thd). */ +UNIV_INTERN +void +lock_unlock_table_autoinc( +/*======================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Check whether the transaction has already been rolled back because it +was selected as a deadlock victim, or if it has to wait then cancel +the wait lock. +@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */ +UNIV_INTERN +dberr_t +lock_trx_handle_wait( +/*=================*/ + trx_t* trx) /*!< in/out: trx lock state */ + __attribute__((nonnull)); +/*********************************************************************//** +Get the number of locks on a table. +@return number of locks */ +UNIV_INTERN +ulint +lock_table_get_n_locks( +/*===================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +/*********************************************************************//** +Checks that a transaction id is sensible, i.e., not in the future. +@return true if ok */ +UNIV_INTERN +bool +lock_check_trx_id_sanity( +/*=====================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Check if the transaction holds any locks on the sys tables +or its records. +@return the strongest lock found on any sys table or 0 for none */ +UNIV_INTERN +const lock_t* +lock_trx_has_sys_table_locks( +/*=========================*/ + const trx_t* trx) /*!< in: transaction to check */ + __attribute__((warn_unused_result)); + +/*******************************************************************//** +Check if the transaction holds an exclusive lock on a record. +@return whether the locks are held */ +UNIV_INTERN +bool +lock_trx_has_rec_x_lock( +/*====================*/ + const trx_t* trx, /*!< in: transaction to check */ + const dict_table_t* table, /*!< in: table to check */ + const buf_block_t* block, /*!< in: buffer block of the record */ + ulint heap_no)/*!< in: record heap number */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ + +/** Lock modes and types */ +/* @{ */ +#define LOCK_MODE_MASK 0xFUL /*!< mask used to extract mode from the + type_mode field in a lock */ +/** Lock types */ +/* @{ */ +#define LOCK_TABLE 16 /*!< table lock */ +#define LOCK_REC 32 /*!< record lock */ +#define LOCK_TYPE_MASK 0xF0UL /*!< mask used to extract lock type from the + type_mode field in a lock */ +#if LOCK_MODE_MASK & LOCK_TYPE_MASK +# error "LOCK_MODE_MASK & LOCK_TYPE_MASK" +#endif + +#define LOCK_WAIT 256 /*!< Waiting lock flag; when set, it + means that the lock has not yet been + granted, it is just waiting for its + turn in the wait queue */ +/* Precise modes */ +#define LOCK_ORDINARY 0 /*!< this flag denotes an ordinary + next-key lock in contrast to LOCK_GAP + or LOCK_REC_NOT_GAP */ +#define LOCK_GAP 512 /*!< when this bit is set, it means that the + lock holds only on the gap before the record; + for instance, an x-lock on the gap does not + give permission to modify the record on which + the bit is set; locks of this type are created + when records are removed from the index chain + of records */ +#define LOCK_REC_NOT_GAP 1024 /*!< this bit means that the lock is only on + the index record and does NOT block inserts + to the gap before the index record; this is + used in the case when we retrieve a record + with a unique key, and is also used in + locking plain SELECTs (not part of UPDATE + or DELETE) when the user has set the READ + COMMITTED isolation level */ +#define LOCK_INSERT_INTENTION 2048 /*!< this bit is set when we place a waiting + gap type record lock request in order to let + an insert of an index record to wait until + there are no conflicting locks by other + transactions on the gap; note that this flag + remains set when the waiting lock is granted, + or if the lock is inherited to a neighboring + record */ + +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_MODE_MASK +# error +#endif +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_TYPE_MASK +# error +#endif +/* @} */ + +/** Lock operation struct */ +struct lock_op_t{ + dict_table_t* table; /*!< table to be locked */ + enum lock_mode mode; /*!< lock mode */ +}; + +/** The lock system struct */ +struct lock_sys_t{ + ib_mutex_t mutex; /*!< Mutex protecting the + locks */ + hash_table_t* rec_hash; /*!< hash table of the record + locks */ + ib_mutex_t wait_mutex; /*!< Mutex protecting the + next two fields */ + srv_slot_t* waiting_threads; /*!< Array of user threads + suspended while waiting for + locks within InnoDB, protected + by the lock_sys->wait_mutex */ + srv_slot_t* last_slot; /*!< highest slot ever used + in the waiting_threads array, + protected by + lock_sys->wait_mutex */ + ibool rollback_complete; + /*!< TRUE if rollback of all + recovered transactions is + complete. Protected by + lock_sys->mutex */ + + ulint n_lock_max_wait_time; /*!< Max wait time */ + + os_event_t timeout_event; /*!< Set to the event that is + created in the lock wait monitor + thread. A value of 0 means the + thread is not active */ + + bool timeout_thread_active; /*!< True if the timeout thread + is running */ +}; + +/** The lock system */ +extern lock_sys_t* lock_sys; + +/** Test if lock_sys->mutex can be acquired without waiting. */ +#define lock_mutex_enter_nowait() mutex_enter_nowait(&lock_sys->mutex) + +/** Test if lock_sys->mutex is owned. */ +#define lock_mutex_own() mutex_own(&lock_sys->mutex) + +/** Acquire the lock_sys->mutex. */ +#define lock_mutex_enter() do { \ + mutex_enter(&lock_sys->mutex); \ +} while (0) + +/** Release the lock_sys->mutex. */ +#define lock_mutex_exit() do { \ + mutex_exit(&lock_sys->mutex); \ +} while (0) + +/** Test if lock_sys->wait_mutex is owned. */ +#define lock_wait_mutex_own() mutex_own(&lock_sys->wait_mutex) + +/** Acquire the lock_sys->wait_mutex. */ +#define lock_wait_mutex_enter() do { \ + mutex_enter(&lock_sys->wait_mutex); \ +} while (0) + +/** Release the lock_sys->wait_mutex. */ +#define lock_wait_mutex_exit() do { \ + mutex_exit(&lock_sys->wait_mutex); \ +} while (0) + +#ifndef UNIV_NONINL +#include "lock0lock.ic" +#endif + +#endif diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic new file mode 100644 index 00000000000..736936954cb --- /dev/null +++ b/storage/innobase/include/lock0lock.ic @@ -0,0 +1,92 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0lock.ic +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#include "sync0sync.h" +#include "srv0srv.h" +#include "dict0dict.h" +#include "row0row.h" +#include "trx0sys.h" +#include "trx0trx.h" +#include "buf0buf.h" +#include "page0page.h" +#include "page0cur.h" +#include "row0vers.h" +#include "que0que.h" +#include "btr0cur.h" +#include "read0read.h" +#include "log0recv.h" + +/*********************************************************************//** +Calculates the fold value of a page file address: used in inserting or +searching for a lock in the hash table. +@return folded value */ +UNIV_INLINE +ulint +lock_rec_fold( +/*==========*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ +{ + return(ut_fold_ulint_pair(space, page_no)); +} + +/*********************************************************************//** +Calculates the hash value of a page file address: used in inserting or +searching for a lock in the hash table. +@return hashed value */ +UNIV_INLINE +ulint +lock_rec_hash( +/*==========*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ +{ + return(hash_calc_hash(lock_rec_fold(space, page_no), + lock_sys->rec_hash)); +} + +/*********************************************************************//** +Gets the heap_no of the smallest user record on a page. +@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */ +UNIV_INLINE +ulint +lock_get_min_heap_no( +/*=================*/ + const buf_block_t* block) /*!< in: buffer block */ +{ + const page_t* page = block->frame; + + if (page_is_comp(page)) { + return(rec_get_heap_no_new( + page + + rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE))); + } else { + return(rec_get_heap_no_old( + page + + rec_get_next_offs(page + PAGE_OLD_INFIMUM, + FALSE))); + } +} diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h new file mode 100644 index 00000000000..9f7ab9f76b6 --- /dev/null +++ b/storage/innobase/include/lock0priv.h @@ -0,0 +1,126 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0priv.h +Lock module internal structures and methods. + +Created July 12, 2007 Vasil Dimov +*******************************************************/ + +#ifndef lock0priv_h +#define lock0priv_h + +#ifndef LOCK_MODULE_IMPLEMENTATION +/* If you need to access members of the structures defined in this +file, please write appropriate functions that retrieve them and put +those functions in lock/ */ +#error Do not include lock0priv.h outside of the lock/ module +#endif + +#include "univ.i" +#include "dict0types.h" +#include "hash0hash.h" +#include "trx0types.h" +#include "ut0lst.h" + +/** A table lock */ +struct lock_table_t { + dict_table_t* table; /*!< database table in dictionary + cache */ + UT_LIST_NODE_T(lock_t) + locks; /*!< list of locks on the same + table */ +}; + +/** Record lock for a page */ +struct lock_rec_t { + ulint space; /*!< space id */ + ulint page_no; /*!< page number */ + ulint n_bits; /*!< number of bits in the lock + bitmap; NOTE: the lock bitmap is + placed immediately after the + lock struct */ +}; + +/** Lock struct; protected by lock_sys->mutex */ +struct lock_t { + trx_t* trx; /*!< transaction owning the + lock */ + UT_LIST_NODE_T(lock_t) + trx_locks; /*!< list of the locks of the + transaction */ + ulint type_mode; /*!< lock type, mode, LOCK_GAP or + LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION, + wait flag, ORed */ + hash_node_t hash; /*!< hash chain node for a record + lock */ + dict_index_t* index; /*!< index for a record lock */ + union { + lock_table_t tab_lock;/*!< table lock */ + lock_rec_t rec_lock;/*!< record lock */ + } un_member; /*!< lock details */ +}; + +/*********************************************************************//** +Gets the type of a lock. +@return LOCK_TABLE or LOCK_REC */ +UNIV_INLINE +ulint +lock_get_type_low( +/*==============*/ + const lock_t* lock); /*!< in: lock */ + +/*********************************************************************//** +Gets the previous record lock set on a record. +@return previous lock on the same record, NULL if none exists */ +UNIV_INTERN +const lock_t* +lock_rec_get_prev( +/*==============*/ + const lock_t* in_lock,/*!< in: record lock */ + ulint heap_no);/*!< in: heap number of the record */ + +/*********************************************************************//** +Cancels a waiting lock request and releases possible other transactions +waiting behind it. */ +UNIV_INTERN +void +lock_cancel_waiting_and_release( +/*============================*/ + lock_t* lock); /*!< in/out: waiting lock request */ + +/*********************************************************************//** +Checks if some transaction has an implicit x-lock on a record in a clustered +index. +@return transaction id of the transaction which has the x-lock, or 0 */ +UNIV_INLINE +trx_id_t +lock_clust_rec_some_has_impl( +/*=========================*/ + const rec_t* rec, /*!< in: user record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_NONINL +#include "lock0priv.ic" +#endif + +#endif /* lock0priv_h */ diff --git a/storage/innobase/include/lock0priv.ic b/storage/innobase/include/lock0priv.ic new file mode 100644 index 00000000000..6b70dc33d3c --- /dev/null +++ b/storage/innobase/include/lock0priv.ic @@ -0,0 +1,67 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0priv.ic +Lock module internal inline methods. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +/* This file contains only methods which are used in +lock/lock0* files, other than lock/lock0lock.cc. +I.e. lock/lock0lock.cc contains more internal inline +methods but they are used only in that file. */ + +#ifndef LOCK_MODULE_IMPLEMENTATION +#error Do not include lock0priv.ic outside of the lock/ module +#endif + +/*********************************************************************//** +Gets the type of a lock. +@return LOCK_TABLE or LOCK_REC */ +UNIV_INLINE +ulint +lock_get_type_low( +/*==============*/ + const lock_t* lock) /*!< in: lock */ +{ + ut_ad(lock); + + return(lock->type_mode & LOCK_TYPE_MASK); +} + +/*********************************************************************//** +Checks if some transaction has an implicit x-lock on a record in a clustered +index. +@return transaction id of the transaction which has the x-lock, or 0 */ +UNIV_INLINE +trx_id_t +lock_clust_rec_some_has_impl( +/*=========================*/ + const rec_t* rec, /*!< in: user record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(page_rec_is_user_rec(rec)); + + return(row_get_rec_trx_id(rec, index, offsets)); +} + +/* vim: set filetype=c: */ diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h new file mode 100644 index 00000000000..cf32e72f864 --- /dev/null +++ b/storage/innobase/include/lock0types.h @@ -0,0 +1,47 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0types.h +The transaction lock system global types + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#ifndef lock0types_h +#define lock0types_h + +#define lock_t ib_lock_t +struct lock_t; +struct lock_sys_t; + +/* Basic lock modes */ +enum lock_mode { + LOCK_IS = 0, /* intention shared */ + LOCK_IX, /* intention exclusive */ + LOCK_S, /* shared */ + LOCK_X, /* exclusive */ + LOCK_AUTO_INC, /* locks the auto-inc counter of a table + in an exclusive mode */ + LOCK_NONE, /* this is used elsewhere to note consistent read */ + LOCK_NUM = LOCK_NONE, /* number of lock modes */ + LOCK_NONE_UNSET = 255 +}; + + +#endif diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h new file mode 100644 index 00000000000..1318b62c242 --- /dev/null +++ b/storage/innobase/include/log0log.h @@ -0,0 +1,999 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2009, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0log.h +Database log + +Created 12/9/1995 Heikki Tuuri +*******************************************************/ + +#ifndef log0log_h +#define log0log_h + +#include "univ.i" +#include "ut0byte.h" +#include "ut0lst.h" +#ifndef UNIV_HOTBACKUP +#include "sync0sync.h" +#include "sync0rw.h" +#endif /* !UNIV_HOTBACKUP */ + +/* Type used for all log sequence number storage and arithmetics */ +typedef ib_uint64_t lsn_t; +#define LSN_MAX IB_UINT64_MAX + +#define LSN_PF UINT64PF + +/** Redo log buffer */ +struct log_t; +/** Redo log group */ +struct log_group_t; + +#ifdef UNIV_DEBUG +/** Flag: write to log file? */ +extern ibool log_do_write; +/** Flag: enable debug output when writing to the log? */ +extern ibool log_debug_writes; +#else /* UNIV_DEBUG */ +/** Write to log */ +# define log_do_write TRUE +#endif /* UNIV_DEBUG */ + +/** Wait modes for log_write_up_to @{ */ +#define LOG_NO_WAIT 91 +#define LOG_WAIT_ONE_GROUP 92 +#define LOG_WAIT_ALL_GROUPS 93 +/* @} */ +/** Maximum number of log groups in log_group_t::checkpoint_buf */ +#define LOG_MAX_N_GROUPS 32 + +/*******************************************************************//** +Calculates where in log files we find a specified lsn. +@return log file number */ +UNIV_INTERN +ulint +log_calc_where_lsn_is( +/*==================*/ + ib_int64_t* log_file_offset, /*!< out: offset in that file + (including the header) */ + ib_uint64_t first_header_lsn, /*!< in: first log file start + lsn */ + ib_uint64_t lsn, /*!< in: lsn whose position to + determine */ + ulint n_log_files, /*!< in: total number of log + files */ + ib_int64_t log_file_size); /*!< in: log file size + (including the header) */ +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Writes to the log the string given. The log must be released with +log_release. +@return end lsn of the log record, zero if did not succeed */ +UNIV_INLINE +lsn_t +log_reserve_and_write_fast( +/*=======================*/ + const void* str, /*!< in: string */ + ulint len, /*!< in: string length */ + lsn_t* start_lsn);/*!< out: start lsn of the log record */ +/***********************************************************************//** +Releases the log mutex. */ +UNIV_INLINE +void +log_release(void); +/*=============*/ +/***********************************************************************//** +Checks if there is need for a log buffer flush or a new checkpoint, and does +this if yes. Any database operation should call this when it has modified +more than about 4 pages. NOTE that this function may only be called when the +OS thread owns no synchronization objects except the dictionary mutex. */ +UNIV_INLINE +void +log_free_check(void); +/*================*/ +/************************************************************//** +Opens the log for log_write_low. The log must be closed with log_close and +released with log_release. +@return start lsn of the log record */ +UNIV_INTERN +lsn_t +log_reserve_and_open( +/*=================*/ + ulint len); /*!< in: length of data to be catenated */ +/************************************************************//** +Writes to the log the string given. It is assumed that the caller holds the +log mutex. */ +UNIV_INTERN +void +log_write_low( +/*==========*/ + byte* str, /*!< in: string */ + ulint str_len); /*!< in: string length */ +/************************************************************//** +Closes the log. +@return lsn */ +UNIV_INTERN +lsn_t +log_close(void); +/*===========*/ +/************************************************************//** +Gets the current lsn. +@return current lsn */ +UNIV_INLINE +lsn_t +log_get_lsn(void); +/*=============*/ +/**************************************************************** +Gets the log group capacity. It is OK to read the value without +holding log_sys->mutex because it is constant. +@return log group capacity */ +UNIV_INLINE +lsn_t +log_get_capacity(void); +/*==================*/ +/**************************************************************** +Get log_sys::max_modified_age_async. It is OK to read the value without +holding log_sys::mutex because it is constant. +@return max_modified_age_async */ +UNIV_INLINE +lsn_t +log_get_max_modified_age_async(void); +/*================================*/ +/******************************************************//** +Initializes the log. */ +UNIV_INTERN +void +log_init(void); +/*==========*/ +/******************************************************************//** +Inits a log group to the log system. */ +UNIV_INTERN +void +log_group_init( +/*===========*/ + ulint id, /*!< in: group id */ + ulint n_files, /*!< in: number of log files */ + lsn_t file_size, /*!< in: log file size in bytes */ + ulint space_id, /*!< in: space id of the file space + which contains the log files of this + group */ + ulint archive_space_id); /*!< in: space id of the file space + which contains some archived log + files for this group; currently, only + for the first log group this is + used */ +/******************************************************//** +Completes an i/o to a log file. */ +UNIV_INTERN +void +log_io_complete( +/*============*/ + log_group_t* group); /*!< in: log group */ +/******************************************************//** +This function is called, e.g., when a transaction wants to commit. It checks +that the log has been written to the log file up to the last log entry written +by the transaction. If there is a flush running, it waits and checks if the +flush flushed enough. If not, starts a new flush. */ +UNIV_INTERN +void +log_write_up_to( +/*============*/ + lsn_t lsn, /*!< in: log sequence number up to which + the log should be written, LSN_MAX if not specified */ + ulint wait, /*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, + or LOG_WAIT_ALL_GROUPS */ + ibool flush_to_disk); + /*!< in: TRUE if we want the written log + also to be flushed to disk */ +/****************************************************************//** +Does a syncronous flush of the log buffer to disk. */ +UNIV_INTERN +void +log_buffer_flush_to_disk(void); +/*==========================*/ +/****************************************************************//** +This functions writes the log buffer to the log file and if 'flush' +is set it forces a flush of the log file as well. This is meant to be +called from background master thread only as it does not wait for +the write (+ possible flush) to finish. */ +UNIV_INTERN +void +log_buffer_sync_in_background( +/*==========================*/ + ibool flush); /*<! in: flush the logs to disk */ +/******************************************************//** +Makes a checkpoint. Note that this function does not flush dirty +blocks from the buffer pool: it only checks what is lsn of the oldest +modification in the pool, and writes information about the lsn in +log files. Use log_make_checkpoint_at to flush also the pool. +@return TRUE if success, FALSE if a checkpoint write was already running */ +UNIV_INTERN +ibool +log_checkpoint( +/*===========*/ + ibool sync, /*!< in: TRUE if synchronous operation is + desired */ + ibool write_always); /*!< in: the function normally checks if the + the new checkpoint would have a greater + lsn than the previous one: if not, then no + physical write is done; by setting this + parameter TRUE, a physical write will always be + made to log files */ +/****************************************************************//** +Makes a checkpoint at a given lsn or later. */ +UNIV_INTERN +void +log_make_checkpoint_at( +/*===================*/ + lsn_t lsn, /*!< in: make a checkpoint at this or a + later lsn, if LSN_MAX, makes + a checkpoint at the latest lsn */ + ibool write_always); /*!< in: the function normally checks if + the new checkpoint would have a + greater lsn than the previous one: if + not, then no physical write is done; + by setting this parameter TRUE, a + physical write will always be made to + log files */ +/****************************************************************//** +Makes a checkpoint at the latest lsn and writes it to first page of each +data file in the database, so that we know that the file spaces contain +all modifications up to that lsn. This can only be called at database +shutdown. This function also writes all log in log files to the log archive. */ +UNIV_INTERN +void +logs_empty_and_mark_files_at_shutdown(void); +/*=======================================*/ +/******************************************************//** +Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. */ +UNIV_INTERN +void +log_group_read_checkpoint_info( +/*===========================*/ + log_group_t* group, /*!< in: log group */ + ulint field); /*!< in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */ +/*******************************************************************//** +Gets info from a checkpoint about a log group. */ +UNIV_INTERN +void +log_checkpoint_get_nth_group_info( +/*==============================*/ + const byte* buf, /*!< in: buffer containing checkpoint info */ + ulint n, /*!< in: nth slot */ + ulint* file_no,/*!< out: archived file number */ + ulint* offset);/*!< out: archived file offset */ +/******************************************************//** +Writes checkpoint info to groups. */ +UNIV_INTERN +void +log_groups_write_checkpoint_info(void); +/*==================================*/ +/********************************************************************//** +Starts an archiving operation. +@return TRUE if succeed, FALSE if an archiving operation was already running */ +UNIV_INTERN +ibool +log_archive_do( +/*===========*/ + ibool sync, /*!< in: TRUE if synchronous operation is desired */ + ulint* n_bytes);/*!< out: archive log buffer size, 0 if nothing to + archive */ +/****************************************************************//** +Writes the log contents to the archive up to the lsn when this function was +called, and stops the archiving. When archiving is started again, the archived +log file numbers start from a number one higher, so that the archiving will +not write again to the archived log files which exist when this function +returns. +@return DB_SUCCESS or DB_ERROR */ +UNIV_INTERN +ulint +log_archive_stop(void); +/*==================*/ +/****************************************************************//** +Starts again archiving which has been stopped. +@return DB_SUCCESS or DB_ERROR */ +UNIV_INTERN +ulint +log_archive_start(void); +/*===================*/ +/****************************************************************//** +Stop archiving the log so that a gap may occur in the archived log files. +@return DB_SUCCESS or DB_ERROR */ +UNIV_INTERN +ulint +log_archive_noarchivelog(void); +/*==========================*/ +/****************************************************************//** +Start archiving the log so that a gap may occur in the archived log files. +@return DB_SUCCESS or DB_ERROR */ +UNIV_INTERN +ulint +log_archive_archivelog(void); +/*========================*/ +/******************************************************//** +Generates an archived log file name. */ +UNIV_INTERN +void +log_archived_file_name_gen( +/*=======================*/ + char* buf, /*!< in: buffer where to write */ + ulint id, /*!< in: group id */ + ulint file_no);/*!< in: file number */ +#else /* !UNIV_HOTBACKUP */ +/******************************************************//** +Writes info to a buffer of a log group when log files are created in +backup restoration. */ +UNIV_INTERN +void +log_reset_first_header_and_checkpoint( +/*==================================*/ + byte* hdr_buf,/*!< in: buffer which will be written to the + start of the first log file */ + ib_uint64_t start); /*!< in: lsn of the start of the first log file; + we pretend that there is a checkpoint at + start + LOG_BLOCK_HDR_SIZE */ +#endif /* !UNIV_HOTBACKUP */ +/********************************************************************//** +Checks that there is enough free space in the log to start a new query step. +Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this +function may only be called if the calling thread owns no synchronization +objects! */ +UNIV_INTERN +void +log_check_margins(void); +/*===================*/ +#ifndef UNIV_HOTBACKUP +/******************************************************//** +Reads a specified log segment to a buffer. */ +UNIV_INTERN +void +log_group_read_log_seg( +/*===================*/ + ulint type, /*!< in: LOG_ARCHIVE or LOG_RECOVER */ + byte* buf, /*!< in: buffer where to read */ + log_group_t* group, /*!< in: log group */ + lsn_t start_lsn, /*!< in: read area start */ + lsn_t end_lsn); /*!< in: read area end */ +/******************************************************//** +Writes a buffer to a log file group. */ +UNIV_INTERN +void +log_group_write_buf( +/*================*/ + log_group_t* group, /*!< in: log group */ + byte* buf, /*!< in: buffer */ + ulint len, /*!< in: buffer len; must be divisible + by OS_FILE_LOG_BLOCK_SIZE */ + lsn_t start_lsn, /*!< in: start lsn of the buffer; must + be divisible by + OS_FILE_LOG_BLOCK_SIZE */ + ulint new_data_offset);/*!< in: start offset of new data in + buf: this parameter is used to decide + if we have to write a new log file + header */ +/********************************************************//** +Sets the field values in group to correspond to a given lsn. For this function +to work, the values must already be correctly initialized to correspond to +some lsn, for instance, a checkpoint lsn. */ +UNIV_INTERN +void +log_group_set_fields( +/*=================*/ + log_group_t* group, /*!< in/out: group */ + lsn_t lsn); /*!< in: lsn for which the values should be + set */ +/******************************************************//** +Calculates the data capacity of a log group, when the log file headers are not +included. +@return capacity in bytes */ +UNIV_INTERN +lsn_t +log_group_get_capacity( +/*===================*/ + const log_group_t* group); /*!< in: log group */ +#endif /* !UNIV_HOTBACKUP */ +/************************************************************//** +Gets a log block flush bit. +@return TRUE if this block was the first to be written in a log flush */ +UNIV_INLINE +ibool +log_block_get_flush_bit( +/*====================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Gets a log block number stored in the header. +@return log block number stored in the block header */ +UNIV_INLINE +ulint +log_block_get_hdr_no( +/*=================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Gets a log block data length. +@return log block data length measured as a byte offset from the block start */ +UNIV_INLINE +ulint +log_block_get_data_len( +/*===================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Sets the log block data length. */ +UNIV_INLINE +void +log_block_set_data_len( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint len); /*!< in: data length */ +/************************************************************//** +Calculates the checksum for a log block. +@return checksum */ +UNIV_INLINE +ulint +log_block_calc_checksum( +/*====================*/ + const byte* block); /*!< in: log block */ +/************************************************************//** +Gets a log block checksum field value. +@return checksum */ +UNIV_INLINE +ulint +log_block_get_checksum( +/*===================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Sets a log block checksum field value. */ +UNIV_INLINE +void +log_block_set_checksum( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint checksum); /*!< in: checksum */ +/************************************************************//** +Gets a log block first mtr log record group offset. +@return first mtr log record group byte offset from the block start, 0 +if none */ +UNIV_INLINE +ulint +log_block_get_first_rec_group( +/*==========================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Sets the log block first mtr log record group offset. */ +UNIV_INLINE +void +log_block_set_first_rec_group( +/*==========================*/ + byte* log_block, /*!< in/out: log block */ + ulint offset); /*!< in: offset, 0 if none */ +/************************************************************//** +Gets a log block checkpoint number field (4 lowest bytes). +@return checkpoint no (4 lowest bytes) */ +UNIV_INLINE +ulint +log_block_get_checkpoint_no( +/*========================*/ + const byte* log_block); /*!< in: log block */ +/************************************************************//** +Initializes a log block in the log buffer. */ +UNIV_INLINE +void +log_block_init( +/*===========*/ + byte* log_block, /*!< in: pointer to the log buffer */ + lsn_t lsn); /*!< in: lsn within the log block */ +/************************************************************//** +Initializes a log block in the log buffer in the old, < 3.23.52 format, where +there was no checksum yet. */ +UNIV_INLINE +void +log_block_init_in_old_format( +/*=========================*/ + byte* log_block, /*!< in: pointer to the log buffer */ + lsn_t lsn); /*!< in: lsn within the log block */ +/************************************************************//** +Converts a lsn to a log block number. +@return log block number, it is > 0 and <= 1G */ +UNIV_INLINE +ulint +log_block_convert_lsn_to_no( +/*========================*/ + lsn_t lsn); /*!< in: lsn of a byte within the block */ +/******************************************************//** +Prints info of the log. */ +UNIV_INTERN +void +log_print( +/*======*/ + FILE* file); /*!< in: file where to print */ +/******************************************************//** +Peeks the current lsn. +@return TRUE if success, FALSE if could not get the log system mutex */ +UNIV_INTERN +ibool +log_peek_lsn( +/*=========*/ + lsn_t* lsn); /*!< out: if returns TRUE, current lsn is here */ +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +log_refresh_stats(void); +/*===================*/ +/********************************************************//** +Closes all log groups. */ +UNIV_INTERN +void +log_group_close_all(void); +/*=====================*/ +/********************************************************//** +Shutdown the log system but do not release all the memory. */ +UNIV_INTERN +void +log_shutdown(void); +/*==============*/ +/********************************************************//** +Free the log system data structures. */ +UNIV_INTERN +void +log_mem_free(void); +/*==============*/ + +extern log_t* log_sys; + +/* Values used as flags */ +#define LOG_FLUSH 7652559 +#define LOG_CHECKPOINT 78656949 +#ifdef UNIV_LOG_ARCHIVE +# define LOG_ARCHIVE 11122331 +#endif /* UNIV_LOG_ARCHIVE */ +#define LOG_RECOVER 98887331 + +/* The counting of lsn's starts from this value: this must be non-zero */ +#define LOG_START_LSN ((lsn_t) (16 * OS_FILE_LOG_BLOCK_SIZE)) + +#define LOG_BUFFER_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE) +#define LOG_ARCHIVE_BUF_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE / 4) + +/* Offsets of a log block header */ +#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and + is allowed to wrap around at 2G; the + highest bit is set to 1 if this is the + first log block in a log flush write + segment */ +#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL + /* mask used to get the highest bit in + the preceding field */ +#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to + this block */ +#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an + mtr log record group in this log block, + 0 if none; if the value is the same + as LOG_BLOCK_HDR_DATA_LEN, it means + that the first rec group has not yet + been catenated to this log block, but + if it will, it will start at this + offset; an archive recovery can + start parsing the log records starting + from this offset in this log block, + if value not 0 */ +#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of + log_sys->next_checkpoint_no when the + log block was last written to: if the + block has not yet been written full, + this value is only updated before a + log buffer flush */ +#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in + bytes */ + +/* Offsets of a log block trailer from the end of the block */ +#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block + contents; in InnoDB versions + < 3.23.52 this did not contain the + checksum but the same value as + .._HDR_NO */ +#define LOG_BLOCK_TRL_SIZE 4 /* trailer size in bytes */ + +/* Offsets for a checkpoint field */ +#define LOG_CHECKPOINT_NO 0 +#define LOG_CHECKPOINT_LSN 8 +#define LOG_CHECKPOINT_OFFSET_LOW32 16 +#define LOG_CHECKPOINT_LOG_BUF_SIZE 20 +#define LOG_CHECKPOINT_ARCHIVED_LSN 24 +#define LOG_CHECKPOINT_GROUP_ARRAY 32 + +/* For each value smaller than LOG_MAX_N_GROUPS the following 8 bytes: */ + +#define LOG_CHECKPOINT_ARCHIVED_FILE_NO 0 +#define LOG_CHECKPOINT_ARCHIVED_OFFSET 4 + +#define LOG_CHECKPOINT_ARRAY_END (LOG_CHECKPOINT_GROUP_ARRAY\ + + LOG_MAX_N_GROUPS * 8) +#define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END +#define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END) +#if 0 +#define LOG_CHECKPOINT_FSP_FREE_LIMIT (8 + LOG_CHECKPOINT_ARRAY_END) + /*!< Not used (0); + This used to contain the + current fsp free limit in + tablespace 0, in units of one + megabyte. + + This information might have been used + since mysqlbackup version 0.35 but + before 1.41 to decide if unused ends of + non-auto-extending data files + in space 0 can be truncated. + + This information was made obsolete + by mysqlbackup --compress. */ +#define LOG_CHECKPOINT_FSP_MAGIC_N (12 + LOG_CHECKPOINT_ARRAY_END) + /*!< Not used (0); + This magic number tells if the + checkpoint contains the above field: + the field was added to + InnoDB-3.23.50 and + removed from MySQL 5.6 */ +#define LOG_CHECKPOINT_FSP_MAGIC_N_VAL 1441231243 + /*!< if LOG_CHECKPOINT_FSP_MAGIC_N + contains this value, then + LOG_CHECKPOINT_FSP_FREE_LIMIT + is valid */ +#endif +#define LOG_CHECKPOINT_OFFSET_HIGH32 (16 + LOG_CHECKPOINT_ARRAY_END) +#define LOG_CHECKPOINT_SIZE (20 + LOG_CHECKPOINT_ARRAY_END) + + +/* Offsets of a log file header */ +#define LOG_GROUP_ID 0 /* log group number */ +#define LOG_FILE_START_LSN 4 /* lsn of the start of data in this + log file */ +#define LOG_FILE_NO 12 /* 4-byte archived log file number; + this field is only defined in an + archived log file */ +#define LOG_FILE_WAS_CREATED_BY_HOT_BACKUP 16 + /* a 32-byte field which contains + the string 'ibbackup' and the + creation time if the log file was + created by mysqlbackup --restore; + when mysqld is first time started + on the restored database, it can + print helpful info for the user */ +#define LOG_FILE_ARCH_COMPLETED OS_FILE_LOG_BLOCK_SIZE + /* this 4-byte field is TRUE when + the writing of an archived log file + has been completed; this field is + only defined in an archived log file */ +#define LOG_FILE_END_LSN (OS_FILE_LOG_BLOCK_SIZE + 4) + /* lsn where the archived log file + at least extends: actually the + archived log file may extend to a + later lsn, as long as it is within the + same log block as this lsn; this field + is defined only when an archived log + file has been completely written */ +#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE + /* first checkpoint field in the log + header; we write alternately to the + checkpoint fields when we make new + checkpoints; this field is only defined + in the first log file of a log group */ +#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE) + /* second checkpoint field in the log + header */ +#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE) + +#define LOG_GROUP_OK 301 +#define LOG_GROUP_CORRUPTED 302 + +/** Log group consists of a number of log files, each of the same size; a log +group is implemented as a space in the sense of the module fil0fil. */ +struct log_group_t{ + /* The following fields are protected by log_sys->mutex */ + ulint id; /*!< log group id */ + ulint n_files; /*!< number of files in the group */ + lsn_t file_size; /*!< individual log file size in bytes, + including the log file header */ + ulint space_id; /*!< file space which implements the log + group */ + ulint state; /*!< LOG_GROUP_OK or + LOG_GROUP_CORRUPTED */ + lsn_t lsn; /*!< lsn used to fix coordinates within + the log group */ + lsn_t lsn_offset; /*!< the offset of the above lsn */ + ulint n_pending_writes;/*!< number of currently pending flush + writes for this log group */ + byte** file_header_bufs_ptr;/*!< unaligned buffers */ + byte** file_header_bufs;/*!< buffers for each file + header in the group */ +#ifdef UNIV_LOG_ARCHIVE + /*-----------------------------*/ + byte** archive_file_header_bufs_ptr;/*!< unaligned buffers */ + byte** archive_file_header_bufs;/*!< buffers for each file + header in the group */ + ulint archive_space_id;/*!< file space which + implements the log group + archive */ + ulint archived_file_no;/*!< file number corresponding to + log_sys->archived_lsn */ + ulint archived_offset;/*!< file offset corresponding to + log_sys->archived_lsn, 0 if we have + not yet written to the archive file + number archived_file_no */ + ulint next_archived_file_no;/*!< during an archive write, + until the write is completed, we + store the next value for + archived_file_no here: the write + completion function then sets the new + value to ..._file_no */ + ulint next_archived_offset; /*!< like the preceding field */ +#endif /* UNIV_LOG_ARCHIVE */ + /*-----------------------------*/ + lsn_t scanned_lsn; /*!< used only in recovery: recovery scan + succeeded up to this lsn in this log + group */ + byte* checkpoint_buf_ptr;/*!< unaligned checkpoint header */ + byte* checkpoint_buf; /*!< checkpoint header is written from + this buffer to the group */ + UT_LIST_NODE_T(log_group_t) + log_groups; /*!< list of log groups */ +}; + +/** Redo log buffer */ +struct log_t{ + byte pad[64]; /*!< padding to prevent other memory + update hotspots from residing on the + same memory cache line */ + lsn_t lsn; /*!< log sequence number */ + ulint buf_free; /*!< first free offset within the log + buffer */ +#ifndef UNIV_HOTBACKUP + ib_mutex_t mutex; /*!< mutex protecting the log */ + + ib_mutex_t log_flush_order_mutex;/*!< mutex to serialize access to + the flush list when we are putting + dirty blocks in the list. The idea + behind this mutex is to be able + to release log_sys->mutex during + mtr_commit and still ensure that + insertions in the flush_list happen + in the LSN order. */ +#endif /* !UNIV_HOTBACKUP */ + byte* buf_ptr; /* unaligned log buffer */ + byte* buf; /*!< log buffer */ + ulint buf_size; /*!< log buffer size in bytes */ + ulint max_buf_free; /*!< recommended maximum value of + buf_free, after which the buffer is + flushed */ + #ifdef UNIV_LOG_DEBUG + ulint old_buf_free; /*!< value of buf free when log was + last time opened; only in the debug + version */ + ib_uint64_t old_lsn; /*!< value of lsn when log was + last time opened; only in the + debug version */ +#endif /* UNIV_LOG_DEBUG */ + ibool check_flush_or_checkpoint; + /*!< this is set to TRUE when there may + be need to flush the log buffer, or + preflush buffer pool pages, or make + a checkpoint; this MUST be TRUE when + lsn - last_checkpoint_lsn > + max_checkpoint_age; this flag is + peeked at by log_free_check(), which + does not reserve the log mutex */ + UT_LIST_BASE_NODE_T(log_group_t) + log_groups; /*!< log groups */ + +#ifndef UNIV_HOTBACKUP + /** The fields involved in the log buffer flush @{ */ + + ulint buf_next_to_write;/*!< first offset in the log buffer + where the byte content may not exist + written to file, e.g., the start + offset of a log record catenated + later; this is advanced when a flush + operation is completed to all the log + groups */ + volatile bool is_extending; /*!< this is set to true during extend + the log buffer size */ + lsn_t written_to_some_lsn; + /*!< first log sequence number not yet + written to any log group; for this to + be advanced, it is enough that the + write i/o has been completed for any + one log group */ + lsn_t written_to_all_lsn; + /*!< first log sequence number not yet + written to some log group; for this to + be advanced, it is enough that the + write i/o has been completed for all + log groups. + Note that since InnoDB currently + has only one log group therefore + this value is redundant. Also it + is possible that this value + falls behind the + flushed_to_disk_lsn transiently. + It is appropriate to use either + flushed_to_disk_lsn or + write_lsn which are always + up-to-date and accurate. */ + lsn_t write_lsn; /*!< end lsn for the current running + write */ + ulint write_end_offset;/*!< the data in buffer has + been written up to this offset + when the current write ends: + this field will then be copied + to buf_next_to_write */ + lsn_t current_flush_lsn;/*!< end lsn for the current running + write + flush operation */ + lsn_t flushed_to_disk_lsn; + /*!< how far we have written the log + AND flushed to disk */ + ulint n_pending_writes;/*!< number of currently + pending flushes or writes */ + /* NOTE on the 'flush' in names of the fields below: starting from + 4.0.14, we separate the write of the log file and the actual fsync() + or other method to flush it to disk. The names below shhould really + be 'flush_or_write'! */ + os_event_t no_flush_event; /*!< this event is in the reset state + when a flush or a write is running; + a thread should wait for this without + owning the log mutex, but NOTE that + to set or reset this event, the + thread MUST own the log mutex! */ + ibool one_flushed; /*!< during a flush, this is + first FALSE and becomes TRUE + when one log group has been + written or flushed */ + os_event_t one_flushed_event;/*!< this event is reset when the + flush or write has not yet completed + for any log group; e.g., this means + that a transaction has been committed + when this is set; a thread should wait + for this without owning the log mutex, + but NOTE that to set or reset this + event, the thread MUST own the log + mutex! */ + ulint n_log_ios; /*!< number of log i/os initiated thus + far */ + ulint n_log_ios_old; /*!< number of log i/o's at the + previous printout */ + time_t last_printout_time;/*!< when log_print was last time + called */ + /* @} */ + + /** Fields involved in checkpoints @{ */ + lsn_t log_group_capacity; /*!< capacity of the log group; if + the checkpoint age exceeds this, it is + a serious error because it is possible + we will then overwrite log and spoil + crash recovery */ + lsn_t max_modified_age_async; + /*!< when this recommended + value for lsn - + buf_pool_get_oldest_modification() + is exceeded, we start an + asynchronous preflush of pool pages */ + lsn_t max_modified_age_sync; + /*!< when this recommended + value for lsn - + buf_pool_get_oldest_modification() + is exceeded, we start a + synchronous preflush of pool pages */ + lsn_t max_checkpoint_age_async; + /*!< when this checkpoint age + is exceeded we start an + asynchronous writing of a new + checkpoint */ + lsn_t max_checkpoint_age; + /*!< this is the maximum allowed value + for lsn - last_checkpoint_lsn when a + new query step is started */ + ib_uint64_t next_checkpoint_no; + /*!< next checkpoint number */ + lsn_t last_checkpoint_lsn; + /*!< latest checkpoint lsn */ + lsn_t next_checkpoint_lsn; + /*!< next checkpoint lsn */ + ulint n_pending_checkpoint_writes; + /*!< number of currently pending + checkpoint writes */ + rw_lock_t checkpoint_lock;/*!< this latch is x-locked when a + checkpoint write is running; a thread + should wait for this without owning + the log mutex */ +#endif /* !UNIV_HOTBACKUP */ + byte* checkpoint_buf_ptr;/* unaligned checkpoint header */ + byte* checkpoint_buf; /*!< checkpoint header is read to this + buffer */ + /* @} */ +#ifdef UNIV_LOG_ARCHIVE + /** Fields involved in archiving @{ */ + ulint archiving_state;/*!< LOG_ARCH_ON, LOG_ARCH_STOPPING + LOG_ARCH_STOPPED, LOG_ARCH_OFF */ + lsn_t archived_lsn; /*!< archiving has advanced to this + lsn */ + lsn_t max_archived_lsn_age_async; + /*!< recommended maximum age of + archived_lsn, before we start + asynchronous copying to the archive */ + lsn_t max_archived_lsn_age; + /*!< maximum allowed age for + archived_lsn */ + lsn_t next_archived_lsn;/*!< during an archive write, + until the write is completed, we + store the next value for + archived_lsn here: the write + completion function then sets the new + value to archived_lsn */ + ulint archiving_phase;/*!< LOG_ARCHIVE_READ or + LOG_ARCHIVE_WRITE */ + ulint n_pending_archive_ios; + /*!< number of currently pending reads + or writes in archiving */ + rw_lock_t archive_lock; /*!< this latch is x-locked when an + archive write is running; a thread + should wait for this without owning + the log mutex */ + ulint archive_buf_size;/*!< size of archive_buf */ + byte* archive_buf; /*!< log segment is written to the + archive from this buffer */ + os_event_t archiving_on; /*!< if archiving has been stopped, + a thread can wait for this event to + become signaled */ + /* @} */ +#endif /* UNIV_LOG_ARCHIVE */ +}; + +/** Test if flush order mutex is owned. */ +#define log_flush_order_mutex_own() \ + mutex_own(&log_sys->log_flush_order_mutex) + +/** Acquire the flush order mutex. */ +#define log_flush_order_mutex_enter() do { \ + mutex_enter(&log_sys->log_flush_order_mutex); \ +} while (0) +/** Release the flush order mutex. */ +# define log_flush_order_mutex_exit() do { \ + mutex_exit(&log_sys->log_flush_order_mutex); \ +} while (0) + +#ifdef UNIV_LOG_ARCHIVE +/** Archiving state @{ */ +#define LOG_ARCH_ON 71 +#define LOG_ARCH_STOPPING 72 +#define LOG_ARCH_STOPPING2 73 +#define LOG_ARCH_STOPPED 74 +#define LOG_ARCH_OFF 75 +/* @} */ +#endif /* UNIV_LOG_ARCHIVE */ + +#ifndef UNIV_NONINL +#include "log0log.ic" +#endif + +#endif diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic new file mode 100644 index 00000000000..9fc12f766bf --- /dev/null +++ b/storage/innobase/include/log0log.ic @@ -0,0 +1,462 @@ +/***************************************************************************** + +Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0log.ic +Database log + +Created 12/9/1995 Heikki Tuuri +*******************************************************/ + +#include "os0file.h" +#include "mach0data.h" +#include "mtr0mtr.h" +#include "srv0mon.h" + +#ifdef UNIV_LOG_DEBUG +/******************************************************//** +Checks by parsing that the catenated log segment for a single mtr is +consistent. */ +UNIV_INTERN +ibool +log_check_log_recs( +/*===============*/ + const byte* buf, /*!< in: pointer to the start of + the log segment in the + log_sys->buf log buffer */ + ulint len, /*!< in: segment length in bytes */ + ib_uint64_t buf_start_lsn); /*!< in: buffer start lsn */ +#endif /* UNIV_LOG_DEBUG */ + +/************************************************************//** +Gets a log block flush bit. +@return TRUE if this block was the first to be written in a log flush */ +UNIV_INLINE +ibool +log_block_get_flush_bit( +/*====================*/ + const byte* log_block) /*!< in: log block */ +{ + if (LOG_BLOCK_FLUSH_BIT_MASK + & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************//** +Sets the log block flush bit. */ +UNIV_INLINE +void +log_block_set_flush_bit( +/*====================*/ + byte* log_block, /*!< in/out: log block */ + ibool val) /*!< in: value to set */ +{ + ulint field; + + field = mach_read_from_4(log_block + LOG_BLOCK_HDR_NO); + + if (val) { + field = field | LOG_BLOCK_FLUSH_BIT_MASK; + } else { + field = field & ~LOG_BLOCK_FLUSH_BIT_MASK; + } + + mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, field); +} + +/************************************************************//** +Gets a log block number stored in the header. +@return log block number stored in the block header */ +UNIV_INLINE +ulint +log_block_get_hdr_no( +/*=================*/ + const byte* log_block) /*!< in: log block */ +{ + return(~LOG_BLOCK_FLUSH_BIT_MASK + & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)); +} + +/************************************************************//** +Sets the log block number stored in the header; NOTE that this must be set +before the flush bit! */ +UNIV_INLINE +void +log_block_set_hdr_no( +/*=================*/ + byte* log_block, /*!< in/out: log block */ + ulint n) /*!< in: log block number: must be > 0 and + < LOG_BLOCK_FLUSH_BIT_MASK */ +{ + ut_ad(n > 0); + ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK); + + mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, n); +} + +/************************************************************//** +Gets a log block data length. +@return log block data length measured as a byte offset from the block start */ +UNIV_INLINE +ulint +log_block_get_data_len( +/*===================*/ + const byte* log_block) /*!< in: log block */ +{ + return(mach_read_from_2(log_block + LOG_BLOCK_HDR_DATA_LEN)); +} + +/************************************************************//** +Sets the log block data length. */ +UNIV_INLINE +void +log_block_set_data_len( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint len) /*!< in: data length */ +{ + mach_write_to_2(log_block + LOG_BLOCK_HDR_DATA_LEN, len); +} + +/************************************************************//** +Gets a log block first mtr log record group offset. +@return first mtr log record group byte offset from the block start, 0 +if none */ +UNIV_INLINE +ulint +log_block_get_first_rec_group( +/*==========================*/ + const byte* log_block) /*!< in: log block */ +{ + return(mach_read_from_2(log_block + LOG_BLOCK_FIRST_REC_GROUP)); +} + +/************************************************************//** +Sets the log block first mtr log record group offset. */ +UNIV_INLINE +void +log_block_set_first_rec_group( +/*==========================*/ + byte* log_block, /*!< in/out: log block */ + ulint offset) /*!< in: offset, 0 if none */ +{ + mach_write_to_2(log_block + LOG_BLOCK_FIRST_REC_GROUP, offset); +} + +/************************************************************//** +Gets a log block checkpoint number field (4 lowest bytes). +@return checkpoint no (4 lowest bytes) */ +UNIV_INLINE +ulint +log_block_get_checkpoint_no( +/*========================*/ + const byte* log_block) /*!< in: log block */ +{ + return(mach_read_from_4(log_block + LOG_BLOCK_CHECKPOINT_NO)); +} + +/************************************************************//** +Sets a log block checkpoint number field (4 lowest bytes). */ +UNIV_INLINE +void +log_block_set_checkpoint_no( +/*========================*/ + byte* log_block, /*!< in/out: log block */ + ib_uint64_t no) /*!< in: checkpoint no */ +{ + mach_write_to_4(log_block + LOG_BLOCK_CHECKPOINT_NO, (ulint) no); +} + +/************************************************************//** +Converts a lsn to a log block number. +@return log block number, it is > 0 and <= 1G */ +UNIV_INLINE +ulint +log_block_convert_lsn_to_no( +/*========================*/ + lsn_t lsn) /*!< in: lsn of a byte within the block */ +{ + return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & 0x3FFFFFFFUL) + 1); +} + +/************************************************************//** +Calculates the checksum for a log block. +@return checksum */ +UNIV_INLINE +ulint +log_block_calc_checksum( +/*====================*/ + const byte* block) /*!< in: log block */ +{ + ulint sum; + ulint sh; + ulint i; + + sum = 1; + sh = 0; + + for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) { + ulint b = (ulint) block[i]; + sum &= 0x7FFFFFFFUL; + sum += b; + sum += b << sh; + sh++; + if (sh > 24) { + sh = 0; + } + } + + return(sum); +} + +/************************************************************//** +Gets a log block checksum field value. +@return checksum */ +UNIV_INLINE +ulint +log_block_get_checksum( +/*===================*/ + const byte* log_block) /*!< in: log block */ +{ + return(mach_read_from_4(log_block + OS_FILE_LOG_BLOCK_SIZE + - LOG_BLOCK_CHECKSUM)); +} + +/************************************************************//** +Sets a log block checksum field value. */ +UNIV_INLINE +void +log_block_set_checksum( +/*===================*/ + byte* log_block, /*!< in/out: log block */ + ulint checksum) /*!< in: checksum */ +{ + mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE + - LOG_BLOCK_CHECKSUM, + checksum); +} + +/************************************************************//** +Initializes a log block in the log buffer. */ +UNIV_INLINE +void +log_block_init( +/*===========*/ + byte* log_block, /*!< in: pointer to the log buffer */ + lsn_t lsn) /*!< in: lsn within the log block */ +{ + ulint no; + + ut_ad(mutex_own(&(log_sys->mutex))); + + no = log_block_convert_lsn_to_no(lsn); + + log_block_set_hdr_no(log_block, no); + + log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE); + log_block_set_first_rec_group(log_block, 0); +} + +/************************************************************//** +Initializes a log block in the log buffer in the old format, where there +was no checksum yet. */ +UNIV_INLINE +void +log_block_init_in_old_format( +/*=========================*/ + byte* log_block, /*!< in: pointer to the log buffer */ + lsn_t lsn) /*!< in: lsn within the log block */ +{ + ulint no; + + ut_ad(mutex_own(&(log_sys->mutex))); + + no = log_block_convert_lsn_to_no(lsn); + + log_block_set_hdr_no(log_block, no); + mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE + - LOG_BLOCK_CHECKSUM, no); + log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE); + log_block_set_first_rec_group(log_block, 0); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Writes to the log the string given. The log must be released with +log_release. +@return end lsn of the log record, zero if did not succeed */ +UNIV_INLINE +lsn_t +log_reserve_and_write_fast( +/*=======================*/ + const void* str, /*!< in: string */ + ulint len, /*!< in: string length */ + lsn_t* start_lsn)/*!< out: start lsn of the log record */ +{ + ulint data_len; +#ifdef UNIV_LOG_LSN_DEBUG + /* length of the LSN pseudo-record */ + ulint lsn_len; +#endif /* UNIV_LOG_LSN_DEBUG */ + + mutex_enter(&log_sys->mutex); +#ifdef UNIV_LOG_LSN_DEBUG + lsn_len = 1 + + mach_get_compressed_size(log_sys->lsn >> 32) + + mach_get_compressed_size(log_sys->lsn & 0xFFFFFFFFUL); +#endif /* UNIV_LOG_LSN_DEBUG */ + + data_len = len +#ifdef UNIV_LOG_LSN_DEBUG + + lsn_len +#endif /* UNIV_LOG_LSN_DEBUG */ + + log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE; + + if (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + + /* The string does not fit within the current log block + or the log block would become full */ + + mutex_exit(&log_sys->mutex); + + return(0); + } + + *start_lsn = log_sys->lsn; + +#ifdef UNIV_LOG_LSN_DEBUG + { + /* Write the LSN pseudo-record. */ + byte* b = &log_sys->buf[log_sys->buf_free]; + *b++ = MLOG_LSN | (MLOG_SINGLE_REC_FLAG & *(const byte*) str); + /* Write the LSN in two parts, + as a pseudo page number and space id. */ + b += mach_write_compressed(b, log_sys->lsn >> 32); + b += mach_write_compressed(b, log_sys->lsn & 0xFFFFFFFFUL); + ut_a(b - lsn_len == &log_sys->buf[log_sys->buf_free]); + + memcpy(b, str, len); + len += lsn_len; + } +#else /* UNIV_LOG_LSN_DEBUG */ + memcpy(log_sys->buf + log_sys->buf_free, str, len); +#endif /* UNIV_LOG_LSN_DEBUG */ + + log_block_set_data_len((byte*) ut_align_down(log_sys->buf + + log_sys->buf_free, + OS_FILE_LOG_BLOCK_SIZE), + data_len); +#ifdef UNIV_LOG_DEBUG + log_sys->old_buf_free = log_sys->buf_free; + log_sys->old_lsn = log_sys->lsn; +#endif + log_sys->buf_free += len; + + ut_ad(log_sys->buf_free <= log_sys->buf_size); + + log_sys->lsn += len; + + MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE, + log_sys->lsn - log_sys->last_checkpoint_lsn); + +#ifdef UNIV_LOG_DEBUG + log_check_log_recs(log_sys->buf + log_sys->old_buf_free, + log_sys->buf_free - log_sys->old_buf_free, + log_sys->old_lsn); +#endif + return(log_sys->lsn); +} + +/***********************************************************************//** +Releases the log mutex. */ +UNIV_INLINE +void +log_release(void) +/*=============*/ +{ + mutex_exit(&(log_sys->mutex)); +} + +/************************************************************//** +Gets the current lsn. +@return current lsn */ +UNIV_INLINE +lsn_t +log_get_lsn(void) +/*=============*/ +{ + lsn_t lsn; + + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + return(lsn); +} + +/**************************************************************** +Gets the log group capacity. It is OK to read the value without +holding log_sys->mutex because it is constant. +@return log group capacity */ +UNIV_INLINE +lsn_t +log_get_capacity(void) +/*==================*/ +{ + return(log_sys->log_group_capacity); +} + +/**************************************************************** +Get log_sys::max_modified_age_async. It is OK to read the value without +holding log_sys::mutex because it is constant. +@return max_modified_age_async */ +UNIV_INLINE +lsn_t +log_get_max_modified_age_async(void) +/*================================*/ +{ + return(log_sys->max_modified_age_async); +} + +/***********************************************************************//** +Checks if there is need for a log buffer flush or a new checkpoint, and does +this if yes. Any database operation should call this when it has modified +more than about 4 pages. NOTE that this function may only be called when the +OS thread owns no synchronization objects except the dictionary mutex. */ +UNIV_INLINE +void +log_free_check(void) +/*================*/ +{ + +#ifdef UNIV_SYNC_DEBUG + ut_ad(sync_thread_levels_empty_except_dict()); +#endif /* UNIV_SYNC_DEBUG */ + + if (log_sys->check_flush_or_checkpoint) { + + log_check_margins(); + } +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h new file mode 100644 index 00000000000..8ede49d4ecc --- /dev/null +++ b/storage/innobase/include/log0recv.h @@ -0,0 +1,505 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0recv.h +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +#ifndef log0recv_h +#define log0recv_h + +#include "univ.i" +#include "ut0byte.h" +#include "buf0types.h" +#include "hash0hash.h" +#include "log0log.h" +#include <list> + +#ifdef UNIV_HOTBACKUP +extern ibool recv_replay_file_ops; + +/*******************************************************************//** +Reads the checkpoint info needed in hot backup. +@return TRUE if success */ +UNIV_INTERN +ibool +recv_read_checkpoint_info_for_backup( +/*=================================*/ + const byte* hdr, /*!< in: buffer containing the log group + header */ + lsn_t* lsn, /*!< out: checkpoint lsn */ + lsn_t* offset, /*!< out: checkpoint offset in the log group */ + lsn_t* cp_no, /*!< out: checkpoint number */ + lsn_t* first_header_lsn) + /*!< out: lsn of of the start of the + first log file */ + __attribute__((nonnull)); +/*******************************************************************//** +Scans the log segment and n_bytes_scanned is set to the length of valid +log scanned. */ +UNIV_INTERN +void +recv_scan_log_seg_for_backup( +/*=========================*/ + byte* buf, /*!< in: buffer containing log data */ + ulint buf_len, /*!< in: data length in that buffer */ + lsn_t* scanned_lsn, /*!< in/out: lsn of buffer start, + we return scanned lsn */ + ulint* scanned_checkpoint_no, + /*!< in/out: 4 lowest bytes of the + highest scanned checkpoint number so + far */ + ulint* n_bytes_scanned);/*!< out: how much we were able to + scan, smaller than buf_len if log + data ended here */ +#endif /* UNIV_HOTBACKUP */ +/*******************************************************************//** +Returns TRUE if recovery is currently running. +@return recv_recovery_on */ +UNIV_INLINE +ibool +recv_recovery_is_on(void); +/*=====================*/ +#ifdef UNIV_LOG_ARCHIVE +/*******************************************************************//** +Returns TRUE if recovery from backup is currently running. +@return recv_recovery_from_backup_on */ +UNIV_INLINE +ibool +recv_recovery_from_backup_is_on(void); +/*=================================*/ +#endif /* UNIV_LOG_ARCHIVE */ +/************************************************************************//** +Applies the hashed log records to the page, if the page lsn is less than the +lsn of a log record. This can be called when a buffer page has just been +read in, or also for a page already in the buffer pool. */ +UNIV_INTERN +void +recv_recover_page_func( +/*===================*/ +#ifndef UNIV_HOTBACKUP + ibool just_read_in, + /*!< in: TRUE if the i/o handler calls + this for a freshly read page */ +#endif /* !UNIV_HOTBACKUP */ + buf_block_t* block); /*!< in/out: buffer block */ +#ifndef UNIV_HOTBACKUP +/** Wrapper for recv_recover_page_func(). +Applies the hashed log records to the page, if the page lsn is less than the +lsn of a log record. This can be called when a buffer page has just been +read in, or also for a page already in the buffer pool. +@param jri in: TRUE if just read in (the i/o handler calls this for +a freshly read page) +@param block in/out: the buffer block +*/ +# define recv_recover_page(jri, block) recv_recover_page_func(jri, block) +#else /* !UNIV_HOTBACKUP */ +/** Wrapper for recv_recover_page_func(). +Applies the hashed log records to the page, if the page lsn is less than the +lsn of a log record. This can be called when a buffer page has just been +read in, or also for a page already in the buffer pool. +@param jri in: TRUE if just read in (the i/o handler calls this for +a freshly read page) +@param block in/out: the buffer block +*/ +# define recv_recover_page(jri, block) recv_recover_page_func(block) +#endif /* !UNIV_HOTBACKUP */ +/********************************************************//** +Recovers from a checkpoint. When this function returns, the database is able +to start processing of new user transactions, but the function +recv_recovery_from_checkpoint_finish should be called later to complete +the recovery and free the resources used in it. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +recv_recovery_from_checkpoint_start_func( +/*=====================================*/ +#ifdef UNIV_LOG_ARCHIVE + ulint type, /*!< in: LOG_CHECKPOINT or + LOG_ARCHIVE */ + lsn_t limit_lsn, /*!< in: recover up to this lsn + if possible */ +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t min_flushed_lsn,/*!< in: min flushed lsn from + data files */ + lsn_t max_flushed_lsn);/*!< in: max flushed lsn from + data files */ +#ifdef UNIV_LOG_ARCHIVE +/** Wrapper for recv_recovery_from_checkpoint_start_func(). +Recovers from a checkpoint. When this function returns, the database is able +to start processing of new user transactions, but the function +recv_recovery_from_checkpoint_finish should be called later to complete +the recovery and free the resources used in it. +@param type in: LOG_CHECKPOINT or LOG_ARCHIVE +@param lim in: recover up to this log sequence number if possible +@param min in: minimum flushed log sequence number from data files +@param max in: maximum flushed log sequence number from data files +@return error code or DB_SUCCESS */ +# define recv_recovery_from_checkpoint_start(type,lim,min,max) \ + recv_recovery_from_checkpoint_start_func(type,lim,min,max) +#else /* UNIV_LOG_ARCHIVE */ +/** Wrapper for recv_recovery_from_checkpoint_start_func(). +Recovers from a checkpoint. When this function returns, the database is able +to start processing of new user transactions, but the function +recv_recovery_from_checkpoint_finish should be called later to complete +the recovery and free the resources used in it. +@param type ignored: LOG_CHECKPOINT or LOG_ARCHIVE +@param lim ignored: recover up to this log sequence number if possible +@param min in: minimum flushed log sequence number from data files +@param max in: maximum flushed log sequence number from data files +@return error code or DB_SUCCESS */ +# define recv_recovery_from_checkpoint_start(type,lim,min,max) \ + recv_recovery_from_checkpoint_start_func(min,max) +#endif /* UNIV_LOG_ARCHIVE */ +/********************************************************//** +Completes recovery from a checkpoint. */ +UNIV_INTERN +void +recv_recovery_from_checkpoint_finish(void); +/*======================================*/ +/********************************************************//** +Initiates the rollback of active transactions. */ +UNIV_INTERN +void +recv_recovery_rollback_active(void); +/*===============================*/ +/*******************************************************//** +Scans log from a buffer and stores new log data to the parsing buffer. +Parses and hashes the log records if new data found. Unless +UNIV_HOTBACKUP is defined, this function will apply log records +automatically when the hash table becomes full. +@return TRUE if limit_lsn has been reached, or not able to scan any +more in this log group */ +UNIV_INTERN +ibool +recv_scan_log_recs( +/*===============*/ + ulint available_memory,/*!< in: we let the hash table of recs + to grow to this size, at the maximum */ + ibool store_to_hash, /*!< in: TRUE if the records should be + stored to the hash table; this is set + to FALSE if just debug checking is + needed */ + const byte* buf, /*!< in: buffer containing a log + segment or garbage */ + ulint len, /*!< in: buffer length */ + lsn_t start_lsn, /*!< in: buffer start lsn */ + lsn_t* contiguous_lsn, /*!< in/out: it is known that all log + groups contain contiguous log data up + to this lsn */ + lsn_t* group_scanned_lsn);/*!< out: scanning succeeded up to + this lsn */ +/******************************************************//** +Resets the logs. The contents of log files will be lost! */ +UNIV_INTERN +void +recv_reset_logs( +/*============*/ +#ifdef UNIV_LOG_ARCHIVE + ulint arch_log_no, /*!< in: next archived log file number */ + ibool new_logs_created,/*!< in: TRUE if resetting logs + is done at the log creation; + FALSE if it is done after + archive recovery */ +#endif /* UNIV_LOG_ARCHIVE */ + lsn_t lsn); /*!< in: reset to this lsn + rounded up to be divisible by + OS_FILE_LOG_BLOCK_SIZE, after + which we add + LOG_BLOCK_HDR_SIZE */ +#ifdef UNIV_HOTBACKUP +/******************************************************//** +Creates new log files after a backup has been restored. */ +UNIV_INTERN +void +recv_reset_log_files_for_backup( +/*============================*/ + const char* log_dir, /*!< in: log file directory path */ + ulint n_log_files, /*!< in: number of log files */ + lsn_t log_file_size, /*!< in: log file size */ + lsn_t lsn); /*!< in: new start lsn, must be + divisible by OS_FILE_LOG_BLOCK_SIZE */ +#endif /* UNIV_HOTBACKUP */ +/********************************************************//** +Creates the recovery system. */ +UNIV_INTERN +void +recv_sys_create(void); +/*=================*/ +/**********************************************************//** +Release recovery system mutexes. */ +UNIV_INTERN +void +recv_sys_close(void); +/*================*/ +/********************************************************//** +Frees the recovery system memory. */ +UNIV_INTERN +void +recv_sys_mem_free(void); +/*===================*/ +/********************************************************//** +Inits the recovery system for a recovery operation. */ +UNIV_INTERN +void +recv_sys_init( +/*==========*/ + ulint available_memory); /*!< in: available memory in bytes */ +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Reset the state of the recovery system variables. */ +UNIV_INTERN +void +recv_sys_var_init(void); +/*===================*/ +#endif /* !UNIV_HOTBACKUP */ +/*******************************************************************//** +Empties the hash table of stored log records, applying them to appropriate +pages. */ +UNIV_INTERN +void +recv_apply_hashed_log_recs( +/*=======================*/ + ibool allow_ibuf); /*!< in: if TRUE, also ibuf operations are + allowed during the application; if FALSE, + no ibuf operations are allowed, and after + the application all file pages are flushed to + disk and invalidated in buffer pool: this + alternative means that no new log records + can be generated during the application */ +#ifdef UNIV_HOTBACKUP +/*******************************************************************//** +Applies log records in the hash table to a backup. */ +UNIV_INTERN +void +recv_apply_log_recs_for_backup(void); +/*================================*/ +#endif +#ifdef UNIV_LOG_ARCHIVE +/********************************************************//** +Recovers from archived log files, and also from log files, if they exist. +@return error code or DB_SUCCESS */ +UNIV_INTERN +ulint +recv_recovery_from_archive_start( +/*=============================*/ + lsn_t min_flushed_lsn,/*!< in: min flushed lsn field from the + data files */ + lsn_t limit_lsn, /*!< in: recover up to this lsn if + possible */ + ulint first_log_no); /*!< in: number of the first archived + log file to use in the recovery; the + file will be searched from + INNOBASE_LOG_ARCH_DIR specified in + server config file */ +/********************************************************//** +Completes recovery from archive. */ +UNIV_INTERN +void +recv_recovery_from_archive_finish(void); +/*===================================*/ +#endif /* UNIV_LOG_ARCHIVE */ + +/** Block of log record data */ +struct recv_data_t{ + recv_data_t* next; /*!< pointer to the next block or NULL */ + /*!< the log record data is stored physically + immediately after this struct, max amount + RECV_DATA_BLOCK_SIZE bytes of it */ +}; + +/** Stored log record struct */ +struct recv_t{ + byte type; /*!< log record type */ + ulint len; /*!< log record body length in bytes */ + recv_data_t* data; /*!< chain of blocks containing the log record + body */ + lsn_t start_lsn;/*!< start lsn of the log segment written by + the mtr which generated this log record: NOTE + that this is not necessarily the start lsn of + this log record */ + lsn_t end_lsn;/*!< end lsn of the log segment written by + the mtr which generated this log record: NOTE + that this is not necessarily the end lsn of + this log record */ + UT_LIST_NODE_T(recv_t) + rec_list;/*!< list of log records for this page */ +}; + +/** States of recv_addr_t */ +enum recv_addr_state { + /** not yet processed */ + RECV_NOT_PROCESSED, + /** page is being read */ + RECV_BEING_READ, + /** log records are being applied on the page */ + RECV_BEING_PROCESSED, + /** log records have been applied on the page, or they have + been discarded because the tablespace does not exist */ + RECV_PROCESSED +}; + +/** Hashed page file address struct */ +struct recv_addr_t{ + enum recv_addr_state state; + /*!< recovery state of the page */ + unsigned space:32;/*!< space id */ + unsigned page_no:32;/*!< page number */ + UT_LIST_BASE_NODE_T(recv_t) + rec_list;/*!< list of log records for this page */ + hash_node_t addr_hash;/*!< hash node in the hash bucket chain */ +}; + +struct recv_dblwr_t { + void add(byte* page); + + byte* find_page(ulint space_id, ulint page_no); + + std::list<byte *> pages; /* Pages from double write buffer */ + + void operator() () { + pages.clear(); + } +}; + +/** Recovery system data structure */ +struct recv_sys_t{ +#ifndef UNIV_HOTBACKUP + ib_mutex_t mutex; /*!< mutex protecting the fields apply_log_recs, + n_addrs, and the state field in each recv_addr + struct */ + ib_mutex_t writer_mutex;/*!< mutex coordinating + flushing between recv_writer_thread and + the recovery thread. */ +#endif /* !UNIV_HOTBACKUP */ + ibool apply_log_recs; + /*!< this is TRUE when log rec application to + pages is allowed; this flag tells the + i/o-handler if it should do log record + application */ + ibool apply_batch_on; + /*!< this is TRUE when a log rec application + batch is running */ + lsn_t lsn; /*!< log sequence number */ + ulint last_log_buf_size; + /*!< size of the log buffer when the database + last time wrote to the log */ + byte* last_block; + /*!< possible incomplete last recovered log + block */ + byte* last_block_buf_start; + /*!< the nonaligned start address of the + preceding buffer */ + byte* buf; /*!< buffer for parsing log records */ + ulint len; /*!< amount of data in buf */ + lsn_t parse_start_lsn; + /*!< this is the lsn from which we were able to + start parsing log records and adding them to + the hash table; zero if a suitable + start point not found yet */ + lsn_t scanned_lsn; + /*!< the log data has been scanned up to this + lsn */ + ulint scanned_checkpoint_no; + /*!< the log data has been scanned up to this + checkpoint number (lowest 4 bytes) */ + ulint recovered_offset; + /*!< start offset of non-parsed log records in + buf */ + lsn_t recovered_lsn; + /*!< the log records have been parsed up to + this lsn */ + lsn_t limit_lsn;/*!< recovery should be made at most + up to this lsn */ + ibool found_corrupt_log; + /*!< this is set to TRUE if we during log + scan find a corrupt log block, or a corrupt + log record, or there is a log parsing + buffer overflow */ +#ifdef UNIV_LOG_ARCHIVE + log_group_t* archive_group; + /*!< in archive recovery: the log group whose + archive is read */ +#endif /* !UNIV_LOG_ARCHIVE */ + mem_heap_t* heap; /*!< memory heap of log records and file + addresses*/ + hash_table_t* addr_hash;/*!< hash table of file addresses of pages */ + ulint n_addrs;/*!< number of not processed hashed file + addresses in the hash table */ + + recv_dblwr_t dblwr; +}; + +/** The recovery system */ +extern recv_sys_t* recv_sys; + +/** TRUE when applying redo log records during crash recovery; FALSE +otherwise. Note that this is FALSE while a background thread is +rolling back incomplete transactions. */ +extern ibool recv_recovery_on; +/** If the following is TRUE, the buffer pool file pages must be invalidated +after recovery and no ibuf operations are allowed; this becomes TRUE if +the log record hash table becomes too full, and log records must be merged +to file pages already before the recovery is finished: in this case no +ibuf operations are allowed, as they could modify the pages read in the +buffer pool before the pages have been recovered to the up-to-date state. + +TRUE means that recovery is running and no operations on the log files +are allowed yet: the variable name is misleading. */ +extern ibool recv_no_ibuf_operations; +/** TRUE when recv_init_crash_recovery() has been called. */ +extern ibool recv_needed_recovery; +#ifdef UNIV_DEBUG +/** TRUE if writing to the redo log (mtr_commit) is forbidden. +Protected by log_sys->mutex. */ +extern ibool recv_no_log_write; +#endif /* UNIV_DEBUG */ + +/** TRUE if buf_page_is_corrupted() should check if the log sequence +number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by +recv_recovery_from_checkpoint_start_func(). */ +extern ibool recv_lsn_checks_on; +#ifdef UNIV_HOTBACKUP +/** TRUE when the redo log is being backed up */ +extern ibool recv_is_making_a_backup; +#endif /* UNIV_HOTBACKUP */ +/** Maximum page number encountered in the redo log */ +extern ulint recv_max_parsed_page_no; + +/** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many +times! */ +#define RECV_PARSING_BUF_SIZE (2 * 1024 * 1024) + +/** Size of block reads when the log groups are scanned forward to do a +roll-forward */ +#define RECV_SCAN_SIZE (4 * UNIV_PAGE_SIZE) + +/** This many frames must be left free in the buffer pool when we scan +the log and store the scanned log records in the buffer pool: we will +use these free frames to read in pages when we start applying the +log records to the database. */ +extern ulint recv_n_pool_free_frames; + +#ifndef UNIV_NONINL +#include "log0recv.ic" +#endif + +#endif diff --git a/storage/innobase/include/log0recv.ic b/storage/innobase/include/log0recv.ic new file mode 100644 index 00000000000..32c28dd03e6 --- /dev/null +++ b/storage/innobase/include/log0recv.ic @@ -0,0 +1,53 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0recv.ic +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +#include "univ.i" + +/*******************************************************************//** +Returns TRUE if recovery is currently running. +@return recv_recovery_on */ +UNIV_INLINE +ibool +recv_recovery_is_on(void) +/*=====================*/ +{ + return(recv_recovery_on); +} + +#ifdef UNIV_LOG_ARCHIVE +/** TRUE when applying redo log records from an archived log file */ +extern ibool recv_recovery_from_backup_on; + +/*******************************************************************//** +Returns TRUE if recovery from backup is currently running. +@return recv_recovery_from_backup_on */ +UNIV_INLINE +ibool +recv_recovery_from_backup_is_on(void) +/*=================================*/ +{ + return(recv_recovery_from_backup_on); +} +#endif /* UNIV_LOG_ARCHIVE */ diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h new file mode 100644 index 00000000000..d0087f56aaa --- /dev/null +++ b/storage/innobase/include/mach0data.h @@ -0,0 +1,418 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/mach0data.h +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef mach0data_h +#define mach0data_h + +#ifndef UNIV_INNOCHECKSUM + +#include "univ.i" +#include "ut0byte.h" + +/* The data and all fields are always stored in a database file +in the same format: ascii, big-endian, ... . +All data in the files MUST be accessed using the functions in this +module. */ + +/*******************************************************//** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /*!< in: pointer to byte where to store */ + ulint n); /*!< in: ulint integer to be stored, >= 0, < 256 */ +/********************************************************//** +The following function is used to fetch data from one byte. +@return ulint integer, >= 0, < 256 */ +UNIV_INLINE +ulint +mach_read_from_1( +/*=============*/ + const byte* b) /*!< in: pointer to byte */ + __attribute__((nonnull, pure)); +/*******************************************************//** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lower address. */ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /*!< in: pointer to two bytes where to store */ + ulint n); /*!< in: ulint integer to be stored, >= 0, < 64k */ +/********************************************************//** +The following function is used to fetch data from two consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer, >= 0, < 64k */ +UNIV_INLINE +ulint +mach_read_from_2( +/*=============*/ + const byte* b) /*!< in: pointer to two bytes */ + __attribute__((nonnull, pure)); + +/********************************************************//** +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. +@return 16-bit integer in canonical format */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + ulint n) /*!< in: integer in machine-dependent format */ + __attribute__((const)); +/********************************************************//** +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. +@return integer in machine-dependent format */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + uint16 n) /*!< in: 16-bit integer in canonical format */ + __attribute__((const)); +/*******************************************************//** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /*!< in: pointer to 3 bytes where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/********************************************************//** +The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer */ +UNIV_INLINE +ulint +mach_read_from_3( +/*=============*/ + const byte* b) /*!< in: pointer to 3 bytes */ + __attribute__((nonnull, pure)); +/*******************************************************//** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /*!< in: pointer to four bytes where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/********************************************************//** +The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer */ +UNIV_INLINE +ulint +mach_read_from_4( +/*=============*/ + const byte* b) /*!< in: pointer to four bytes */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a ulint in a compressed form (1..5 bytes). +@return stored size in bytes */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + byte* b, /*!< in: pointer to memory where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/*********************************************************//** +Returns the size of an ulint when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + ulint n) /*!< in: ulint integer to be stored */ + __attribute__((const)); +/*********************************************************//** +Reads a ulint in a compressed form. +@return read integer */ +UNIV_INLINE +ulint +mach_read_compressed( +/*=================*/ + const byte* b) /*!< in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*******************************************************//** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /*!< in: pointer to 6 bytes where to store */ + ib_uint64_t id); /*!< in: 48-bit integer */ +/********************************************************//** +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. +@return 48-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_6( +/*=============*/ + const byte* b) /*!< in: pointer to 6 bytes */ + __attribute__((nonnull, pure)); +/*******************************************************//** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /*!< in: pointer to 7 bytes where to store */ + ib_uint64_t n); /*!< in: 56-bit integer */ +/********************************************************//** +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. +@return 56-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_7( +/*=============*/ + const byte* b) /*!< in: pointer to 7 bytes */ + __attribute__((nonnull, pure)); +/*******************************************************//** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + void* b, /*!< in: pointer to 8 bytes where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/********************************************************//** +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. +@return 64-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_8( +/*=============*/ + const byte* b) /*!< in: pointer to 8 bytes */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a 64-bit integer in a compressed form (5..9 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_ull_write_compressed( +/*======================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/*********************************************************//** +Returns the size of a 64-bit integer when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_ull_get_compressed_size( +/*=========================*/ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_ull_read_compressed( +/*=====================*/ + const byte* b) /*!< in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a 64-bit integer in a compressed form (1..11 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_ull_write_much_compressed( +/*===========================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/*********************************************************//** +Returns the size of a 64-bit integer when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_ull_get_much_compressed_size( +/*==============================*/ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ + __attribute__((const)); +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_ull_read_much_compressed( +/*==========================*/ + const byte* b) /*!< in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Reads a ulint in a compressed form if the log record fully contains it. +@return pointer to end of the stored field, NULL if not complete */ +UNIV_INTERN +byte* +mach_parse_compressed( +/*==================*/ + byte* ptr, /*!< in: pointer to buffer from where to read */ + byte* end_ptr,/*!< in: pointer to end of the buffer */ + ulint* val); /*!< out: read value */ +/*********************************************************//** +Reads a 64-bit integer in a compressed form +if the log record fully contains it. +@return pointer to end of the stored field, NULL if not complete */ +UNIV_INLINE +byte* +mach_ull_parse_compressed( +/*======================*/ + byte* ptr, /*!< in: pointer to buffer from where to read */ + byte* end_ptr,/*!< in: pointer to end of the buffer */ + ib_uint64_t* val); /*!< out: read value */ +#ifndef UNIV_HOTBACKUP +/*********************************************************//** +Reads a double. It is stored in a little-endian format. +@return double read */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + const byte* b) /*!< in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a double. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /*!< in: pointer to memory where to write */ + double d); /*!< in: double */ +/*********************************************************//** +Reads a float. It is stored in a little-endian format. +@return float read */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + const byte* b) /*!< in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /*!< in: pointer to memory where to write */ + float d); /*!< in: float */ +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + const byte* buf, /*!< in: from where to read */ + ulint buf_size) /*!< in: from how many bytes to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint dest_size, /*!< in: into how many bytes to write */ + ulint n); /*!< in: unsigned long int to write */ +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + const byte* buf) /*!< in: from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint n); /*!< in: unsigned long int to write */ +/*********************************************************//** +Convert integral type from storage byte order (big endian) to +host byte order. +@return integer value */ +UNIV_INLINE +ib_uint64_t +mach_read_int_type( +/*===============*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + ibool unsigned_type); /*!< in: signed or unsigned flag */ +/***********************************************************//** +Convert integral type from host byte order to (big-endian) storage +byte order. */ +UNIV_INLINE +void +mach_write_int_type( +/*================*/ + byte* dest, /*!< in: where to write*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + bool usign); /*!< in: signed or unsigned flag */ + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. */ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< in: where to write */ + ulonglong src, /*!< in: where to read from */ + ulint len, /*!< in: length of dest */ + bool usign); /*!< in: signed or unsigned flag */ + +/********************************************************//** +Reads 1 - 4 bytes from a file page buffered in the buffer pool. +@return value read */ +UNIV_INLINE +ulint +mach_read_ulint( +/*============*/ + const byte* ptr, /*!< in: pointer from where to read */ + ulint type); /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + +#endif /* !UNIV_HOTBACKUP */ +#endif /* !UNIV_INNOCHECKSUM */ + +#ifndef UNIV_NONINL +#include "mach0data.ic" +#endif + +#endif diff --git a/storage/innobase/include/mach0data.ic b/storage/innobase/include/mach0data.ic new file mode 100644 index 00000000000..7449d2da2b8 --- /dev/null +++ b/storage/innobase/include/mach0data.ic @@ -0,0 +1,881 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/mach0data.ic +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef UNIV_INNOCHECKSUM + +#include "ut0mem.h" + +/*******************************************************//** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /*!< in: pointer to byte where to store */ + ulint n) /*!< in: ulint integer to be stored, >= 0, < 256 */ +{ + ut_ad(b); + ut_ad((n | 0xFFUL) <= 0xFFUL); + + b[0] = (byte) n; +} + +/********************************************************//** +The following function is used to fetch data from one byte. +@return ulint integer, >= 0, < 256 */ +UNIV_INLINE +ulint +mach_read_from_1( +/*=============*/ + const byte* b) /*!< in: pointer to byte */ +{ + ut_ad(b); + return((ulint)(b[0])); +} + +/*******************************************************//** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /*!< in: pointer to two bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad(b); + ut_ad((n | 0xFFFFUL) <= 0xFFFFUL); + + b[0] = (byte)(n >> 8); + b[1] = (byte)(n); +} + +/********************************************************//** +The following function is used to fetch data from 2 consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer */ +UNIV_INLINE +ulint +mach_read_from_2( +/*=============*/ + const byte* b) /*!< in: pointer to 2 bytes */ +{ + return(((ulint)(b[0]) << 8) | (ulint)(b[1])); +} + +/********************************************************//** +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. +@return 16-bit integer in canonical format */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + ulint n) /*!< in: integer in machine-dependent format */ +{ + uint16 ret; + ut_ad(2 == sizeof ret); + mach_write_to_2((byte*) &ret, n); + return(ret); +} +/********************************************************//** +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. +@return integer in machine-dependent format */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + uint16 n) /*!< in: 16-bit integer in canonical format */ +{ + ut_ad(2 == sizeof n); + return(mach_read_from_2((const byte*) &n)); +} + +/*******************************************************//** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /*!< in: pointer to 3 bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad(b); + ut_ad((n | 0xFFFFFFUL) <= 0xFFFFFFUL); + + b[0] = (byte)(n >> 16); + b[1] = (byte)(n >> 8); + b[2] = (byte)(n); +} + +/********************************************************//** +The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer */ +UNIV_INLINE +ulint +mach_read_from_3( +/*=============*/ + const byte* b) /*!< in: pointer to 3 bytes */ +{ + ut_ad(b); + return( ((ulint)(b[0]) << 16) + | ((ulint)(b[1]) << 8) + | (ulint)(b[2]) + ); +} + +/*******************************************************//** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /*!< in: pointer to four bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad(b); + + b[0] = (byte)(n >> 24); + b[1] = (byte)(n >> 16); + b[2] = (byte)(n >> 8); + b[3] = (byte) n; +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************//** +The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. +@return ulint integer */ +UNIV_INLINE +ulint +mach_read_from_4( +/*=============*/ + const byte* b) /*!< in: pointer to four bytes */ +{ + ut_ad(b); + return( ((ulint)(b[0]) << 24) + | ((ulint)(b[1]) << 16) + | ((ulint)(b[2]) << 8) + | (ulint)(b[3]) + ); +} + +#ifndef UNIV_INNOCHECKSUM + +/*********************************************************//** +Writes a ulint in a compressed form where the first byte codes the +length of the stored ulint. We look at the most significant bits of +the byte. If the most significant bit is zero, it means 1-byte storage, +else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0, +it means 3-byte storage, else if 4th is 0, it means 4-byte storage, +else the storage is 5-byte. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + byte* b, /*!< in: pointer to memory where to store */ + ulint n) /*!< in: ulint integer (< 2^32) to be stored */ +{ + ut_ad(b); + + if (n < 0x80UL) { + mach_write_to_1(b, n); + return(1); + } else if (n < 0x4000UL) { + mach_write_to_2(b, n | 0x8000UL); + return(2); + } else if (n < 0x200000UL) { + mach_write_to_3(b, n | 0xC00000UL); + return(3); + } else if (n < 0x10000000UL) { + mach_write_to_4(b, n | 0xE0000000UL); + return(4); + } else { + mach_write_to_1(b, 0xF0UL); + mach_write_to_4(b + 1, n); + return(5); + } +} + +/*********************************************************//** +Returns the size of a ulint when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + ulint n) /*!< in: ulint integer (< 2^32) to be stored */ +{ + if (n < 0x80UL) { + return(1); + } else if (n < 0x4000UL) { + return(2); + } else if (n < 0x200000UL) { + return(3); + } else if (n < 0x10000000UL) { + return(4); + } else { + return(5); + } +} + +/*********************************************************//** +Reads a ulint in a compressed form. +@return read integer (< 2^32) */ +UNIV_INLINE +ulint +mach_read_compressed( +/*=================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ulint flag; + + ut_ad(b); + + flag = mach_read_from_1(b); + + if (flag < 0x80UL) { + return(flag); + } else if (flag < 0xC0UL) { + return(mach_read_from_2(b) & 0x7FFFUL); + } else if (flag < 0xE0UL) { + return(mach_read_from_3(b) & 0x3FFFFFUL); + } else if (flag < 0xF0UL) { + return(mach_read_from_4(b) & 0x1FFFFFFFUL); + } else { + ut_ad(flag == 0xF0UL); + return(mach_read_from_4(b + 1)); + } +} + +/*******************************************************//** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + void* b, /*!< in: pointer to 8 bytes where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ut_ad(b); + + mach_write_to_4(static_cast<byte*>(b), (ulint) (n >> 32)); + mach_write_to_4(static_cast<byte*>(b) + 4, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. +@return 64-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_8( +/*=============*/ + const byte* b) /*!< in: pointer to 8 bytes */ +{ + ib_uint64_t ull; + + ull = ((ib_uint64_t) mach_read_from_4(b)) << 32; + ull |= (ib_uint64_t) mach_read_from_4(b + 4); + + return(ull); +} + +/*******************************************************//** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /*!< in: pointer to 7 bytes where to store */ + ib_uint64_t n) /*!< in: 56-bit integer */ +{ + ut_ad(b); + + mach_write_to_3(b, (ulint) (n >> 32)); + mach_write_to_4(b + 3, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. +@return 56-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_7( +/*=============*/ + const byte* b) /*!< in: pointer to 7 bytes */ +{ + ut_ad(b); + + return(ut_ull_create(mach_read_from_3(b), mach_read_from_4(b + 3))); +} + +/*******************************************************//** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /*!< in: pointer to 6 bytes where to store */ + ib_uint64_t n) /*!< in: 48-bit integer */ +{ + ut_ad(b); + + mach_write_to_2(b, (ulint) (n >> 32)); + mach_write_to_4(b + 2, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. +@return 48-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_6( +/*=============*/ + const byte* b) /*!< in: pointer to 6 bytes */ +{ + ut_ad(b); + + return(ut_ull_create(mach_read_from_2(b), mach_read_from_4(b + 2))); +} + +/*********************************************************//** +Writes a 64-bit integer in a compressed form (5..9 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_ull_write_compressed( +/*======================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ulint size; + + ut_ad(b); + + size = mach_write_compressed(b, (ulint) (n >> 32)); + mach_write_to_4(b + size, (ulint) n); + + return(size + 4); +} + +/*********************************************************//** +Returns the size of a 64-bit integer when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_ull_get_compressed_size( +/*=========================*/ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + return(4 + mach_get_compressed_size((ulint) (n >> 32))); +} + +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_ull_read_compressed( +/*=====================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ib_uint64_t n; + ulint size; + + ut_ad(b); + + n = (ib_uint64_t) mach_read_compressed(b); + + size = mach_get_compressed_size((ulint) n); + + n <<= 32; + n |= (ib_uint64_t) mach_read_from_4(b + size); + + return(n); +} + +/*********************************************************//** +Writes a 64-bit integer in a compressed form (1..11 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_ull_write_much_compressed( +/*===========================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ulint size; + + ut_ad(b); + + if (!(n >> 32)) { + return(mach_write_compressed(b, (ulint) n)); + } + + *b = (byte)0xFF; + size = 1 + mach_write_compressed(b + 1, (ulint) (n >> 32)); + + size += mach_write_compressed(b + size, (ulint) n & 0xFFFFFFFF); + + return(size); +} + +/*********************************************************//** +Returns the size of a 64-bit integer when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_ull_get_much_compressed_size( +/*==============================*/ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + if (!(n >> 32)) { + return(mach_get_compressed_size((ulint) n)); + } + + return(1 + mach_get_compressed_size((ulint) (n >> 32)) + + mach_get_compressed_size((ulint) n & ULINT32_MASK)); +} + +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_ull_read_much_compressed( +/*==========================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ib_uint64_t n; + ulint size; + + ut_ad(b); + + if (*b != (byte)0xFF) { + n = 0; + size = 0; + } else { + n = (ib_uint64_t) mach_read_compressed(b + 1); + + size = 1 + mach_get_compressed_size((ulint) n); + n <<= 32; + } + + n |= mach_read_compressed(b + size); + + return(n); +} + +/*********************************************************//** +Reads a 64-bit integer in a compressed form +if the log record fully contains it. +@return pointer to end of the stored field, NULL if not complete */ +UNIV_INLINE +byte* +mach_ull_parse_compressed( +/*======================*/ + byte* ptr, /* in: pointer to buffer from where to read */ + byte* end_ptr,/* in: pointer to end of the buffer */ + ib_uint64_t* val) /* out: read value */ +{ + ulint size; + + ut_ad(ptr); + ut_ad(end_ptr); + ut_ad(val); + + if (end_ptr < ptr + 5) { + + return(NULL); + } + + *val = mach_read_compressed(ptr); + + size = mach_get_compressed_size((ulint) *val); + + ptr += size; + + if (end_ptr < ptr + 4) { + + return(NULL); + } + + *val <<= 32; + *val |= mach_read_from_4(ptr); + + return(ptr + 4); +} +#ifndef UNIV_HOTBACKUP +/*********************************************************//** +Reads a double. It is stored in a little-endian format. +@return double read */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + double d; + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(double) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/*********************************************************//** +Writes a double. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /*!< in: pointer to memory where to write */ + double d) /*!< in: double */ +{ + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(double) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/*********************************************************//** +Reads a float. It is stored in a little-endian format. +@return float read */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + float d; + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(float) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/*********************************************************//** +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /*!< in: pointer to memory where to write */ + float d) /*!< in: float */ +{ + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(float) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + const byte* buf, /*!< in: from where to read */ + ulint buf_size) /*!< in: from how many bytes to read */ +{ + ulint n = 0; + const byte* ptr; + + ut_ad(buf_size > 0); + + ptr = buf + buf_size; + + for (;;) { + ptr--; + + n = n << 8; + + n += (ulint)(*ptr); + + if (ptr == buf) { + break; + } + } + + return(n); +} + +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint dest_size, /*!< in: into how many bytes to write */ + ulint n) /*!< in: unsigned long int to write */ +{ + byte* end; + + ut_ad(dest_size <= sizeof(ulint)); + ut_ad(dest_size > 0); + + end = dest + dest_size; + + for (;;) { + *dest = (byte)(n & 0xFF); + + n = n >> 8; + + dest++; + + if (dest == end) { + break; + } + } + + ut_ad(n == 0); +} + +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + const byte* buf) /*!< in: from where to read */ +{ + return((ulint)(buf[0]) | ((ulint)(buf[1]) << 8)); +} + +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint n) /*!< in: unsigned long int to write */ +{ + ut_ad(n < 256 * 256); + + *dest = (byte)(n & 0xFFUL); + + n = n >> 8; + dest++; + + *dest = (byte)(n & 0xFFUL); +} + +/*********************************************************//** +Convert integral type from storage byte order (big endian) to +host byte order. +@return integer value */ +UNIV_INLINE +ib_uint64_t +mach_read_int_type( +/*===============*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + ibool unsigned_type) /*!< in: signed or unsigned flag */ +{ + /* XXX this can be optimized on big-endian machines */ + + ullint ret; + uint i; + + if (unsigned_type || (src[0] & 0x80)) { + + ret = 0x0000000000000000ULL; + } else { + + ret = 0xFFFFFFFFFFFFFF00ULL; + } + + if (unsigned_type) { + + ret |= src[0]; + } else { + + ret |= src[0] ^ 0x80; + } + + for (i = 1; i < len; i++) { + ret <<= 8; + ret |= src[i]; + } + + return(ret); +} +/*********************************************************//** +Swap byte ordering. */ +UNIV_INLINE +void +mach_swap_byte_order( +/*=================*/ + byte* dest, /*!< out: where to write */ + const byte* from, /*!< in: where to read from */ + ulint len) /*!< in: length of src */ +{ + ut_ad(len > 0); + ut_ad(len <= 8); + + dest += len; + + switch (len & 0x7) { + case 0: *--dest = *from++; + case 7: *--dest = *from++; + case 6: *--dest = *from++; + case 5: *--dest = *from++; + case 4: *--dest = *from++; + case 3: *--dest = *from++; + case 2: *--dest = *from++; + case 1: *--dest = *from; + } +} + +/************************************************************* +Convert integral type from host byte order (big-endian) storage +byte order. */ +UNIV_INLINE +void +mach_write_int_type( +/*================*/ + byte* dest, /*!< in: where to write */ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + bool usign) /*!< in: signed or unsigned flag */ +{ +#ifdef WORDS_BIGENDIAN + memcpy(dest, src, len); +#else + mach_swap_byte_order(dest, src, len); +#endif /* WORDS_BIGENDIAN */ + + if (!usign) { + *dest ^= 0x80; + } +} + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. */ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< in: where to write */ + ulonglong src, /*!< in: where to read from */ + ulint len, /*!< in: length of dest */ + bool usign) /*!< in: signed or unsigned flag */ +{ + byte* ptr = reinterpret_cast<byte*>(&src); + + ut_ad(len <= sizeof(ulonglong)); + +#ifdef WORDS_BIGENDIAN + memcpy(dest, ptr + (sizeof(src) - len), len); +#else + mach_swap_byte_order(dest, reinterpret_cast<byte*>(ptr), len); +#endif /* WORDS_BIGENDIAN */ + + if (!usign) { + *dest ^= 0x80; + } +} + +/********************************************************//** +Reads 1 - 4 bytes from a file page buffered in the buffer pool. +@return value read */ +UNIV_INLINE +ulint +mach_read_ulint( +/*============*/ + const byte* ptr, /*!< in: pointer from where to read */ + ulint type) /*!< in: 1,2 or 4 bytes */ +{ + switch (type) { + case 1: + return(mach_read_from_1(ptr)); + case 2: + return(mach_read_from_2(ptr)); + case 4: + return(mach_read_from_4(ptr)); + default: + ut_error; + } + + return(0); +} + +#endif /* !UNIV_HOTBACKUP */ +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/mem0dbg.h b/storage/innobase/include/mem0dbg.h new file mode 100644 index 00000000000..cc339b82910 --- /dev/null +++ b/storage/innobase/include/mem0dbg.h @@ -0,0 +1,150 @@ +/***************************************************************************** + +Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mem0dbg.h +The memory management: the debug code. This is not a compilation module, +but is included in mem0mem.* ! + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +/* In the debug version each allocated field is surrounded with +check fields whose sizes are given below */ + +#ifdef UNIV_MEM_DEBUG +# ifndef UNIV_HOTBACKUP +/* The mutex which protects in the debug version the hash table +containing the list of live memory heaps, and also the global +variables in mem0dbg.cc. */ +extern ib_mutex_t mem_hash_mutex; +# endif /* !UNIV_HOTBACKUP */ + +#define MEM_FIELD_HEADER_SIZE ut_calc_align(2 * sizeof(ulint),\ + UNIV_MEM_ALIGNMENT) +#define MEM_FIELD_TRAILER_SIZE sizeof(ulint) +#else +#define MEM_FIELD_HEADER_SIZE 0 +#endif + + +/* Space needed when allocating for a user a field of +length N. The space is allocated only in multiples of +UNIV_MEM_ALIGNMENT. In the debug version there are also +check fields at the both ends of the field. */ +#ifdef UNIV_MEM_DEBUG +#define MEM_SPACE_NEEDED(N) ut_calc_align((N) + MEM_FIELD_HEADER_SIZE\ + + MEM_FIELD_TRAILER_SIZE, UNIV_MEM_ALIGNMENT) +#else +#define MEM_SPACE_NEEDED(N) ut_calc_align((N), UNIV_MEM_ALIGNMENT) +#endif + +#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG +/***************************************************************//** +Checks a memory heap for consistency and prints the contents if requested. +Outputs the sum of sizes of buffers given to the user (only in +the debug version), the physical size of the heap and the number of +blocks in the heap. In case of error returns 0 as sizes and number +of blocks. */ +UNIV_INTERN +void +mem_heap_validate_or_print( +/*=======================*/ + mem_heap_t* heap, /*!< in: memory heap */ + byte* top, /*!< in: calculate and validate only until + this top pointer in the heap is reached, + if this pointer is NULL, ignored */ + ibool print, /*!< in: if TRUE, prints the contents + of the heap; works only in + the debug version */ + ibool* error, /*!< out: TRUE if error */ + ulint* us_size,/*!< out: allocated memory + (for the user) in the heap, + if a NULL pointer is passed as this + argument, it is ignored; in the + non-debug version this is always -1 */ + ulint* ph_size,/*!< out: physical size of the heap, + if a NULL pointer is passed as this + argument, it is ignored */ + ulint* n_blocks); /*!< out: number of blocks in the heap, + if a NULL pointer is passed as this + argument, it is ignored */ +/**************************************************************//** +Validates the contents of a memory heap. +@return TRUE if ok */ +UNIV_INTERN +ibool +mem_heap_validate( +/*==============*/ + mem_heap_t* heap); /*!< in: memory heap */ +#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */ +#ifdef UNIV_DEBUG +/**************************************************************//** +Checks that an object is a memory heap (or a block of it) +@return TRUE if ok */ +UNIV_INTERN +ibool +mem_heap_check( +/*===========*/ + mem_heap_t* heap); /*!< in: memory heap */ +#endif /* UNIV_DEBUG */ +#ifdef UNIV_MEM_DEBUG +/*****************************************************************//** +TRUE if no memory is currently allocated. +@return TRUE if no heaps exist */ +UNIV_INTERN +ibool +mem_all_freed(void); +/*===============*/ +/*****************************************************************//** +Validates the dynamic memory +@return TRUE if error */ +UNIV_INTERN +ibool +mem_validate_no_assert(void); +/*=========================*/ +/************************************************************//** +Validates the dynamic memory +@return TRUE if ok */ +UNIV_INTERN +ibool +mem_validate(void); +/*===============*/ +#endif /* UNIV_MEM_DEBUG */ +/************************************************************//** +Tries to find neigboring memory allocation blocks and dumps to stderr +the neighborhood of a given pointer. */ +UNIV_INTERN +void +mem_analyze_corruption( +/*===================*/ + void* ptr); /*!< in: pointer to place of possible corruption */ +/*****************************************************************//** +Prints information of dynamic memory usage and currently allocated memory +heaps or buffers. Can only be used in the debug version. */ +UNIV_INTERN +void +mem_print_info(void); +/*================*/ +/*****************************************************************//** +Prints information of dynamic memory usage and currently allocated memory +heaps or buffers since the last ..._print_info or..._print_new_info. */ +UNIV_INTERN +void +mem_print_new_info(void); +/*====================*/ diff --git a/storage/innobase/include/mem0dbg.ic b/storage/innobase/include/mem0dbg.ic new file mode 100644 index 00000000000..ec60ed35337 --- /dev/null +++ b/storage/innobase/include/mem0dbg.ic @@ -0,0 +1,109 @@ +/***************************************************************************** + +Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/mem0dbg.ic +The memory management: the debug code. This is not an independent +compilation module but is included in mem0mem.*. + +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ + +#ifdef UNIV_MEM_DEBUG +extern ulint mem_current_allocated_memory; + +/******************************************************************//** +Initializes an allocated memory field in the debug version. */ +UNIV_INTERN +void +mem_field_init( +/*===========*/ + byte* buf, /*!< in: memory field */ + ulint n); /*!< in: how many bytes the user requested */ +/******************************************************************//** +Erases an allocated memory field in the debug version. */ +UNIV_INTERN +void +mem_field_erase( +/*============*/ + byte* buf, /*!< in: memory field */ + ulint n); /*!< in: how many bytes the user requested */ +/***************************************************************//** +Initializes a buffer to a random combination of hex BA and BE. +Used to initialize allocated memory. */ +UNIV_INTERN +void +mem_init_buf( +/*=========*/ + byte* buf, /*!< in: pointer to buffer */ + ulint n); /*!< in: length of buffer */ +/***************************************************************//** +Initializes a buffer to a random combination of hex DE and AD. +Used to erase freed memory. */ +UNIV_INTERN +void +mem_erase_buf( +/*==========*/ + byte* buf, /*!< in: pointer to buffer */ + ulint n); /*!< in: length of buffer */ +/***************************************************************//** +Inserts a created memory heap to the hash table of +current allocated memory heaps. +Initializes the hash table when first called. */ +UNIV_INTERN +void +mem_hash_insert( +/*============*/ + mem_heap_t* heap, /*!< in: the created heap */ + const char* file_name, /*!< in: file name of creation */ + ulint line); /*!< in: line where created */ +/***************************************************************//** +Removes a memory heap (which is going to be freed by the caller) +from the list of live memory heaps. Returns the size of the heap +in terms of how much memory in bytes was allocated for the user of +the heap (not the total space occupied by the heap). +Also validates the heap. +NOTE: This function does not free the storage occupied by the +heap itself, only the node in the list of heaps. */ +UNIV_INTERN +void +mem_hash_remove( +/*============*/ + mem_heap_t* heap, /*!< in: the heap to be freed */ + const char* file_name, /*!< in: file name of freeing */ + ulint line); /*!< in: line where freed */ + + +void +mem_field_header_set_len(byte* field, ulint len); + +ulint +mem_field_header_get_len(byte* field); + +void +mem_field_header_set_check(byte* field, ulint check); + +ulint +mem_field_header_get_check(byte* field); + +void +mem_field_trailer_set_check(byte* field, ulint check); + +ulint +mem_field_trailer_get_check(byte* field); +#endif /* UNIV_MEM_DEBUG */ diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h new file mode 100644 index 00000000000..f30034f3074 --- /dev/null +++ b/storage/innobase/include/mem0mem.h @@ -0,0 +1,425 @@ +/***************************************************************************** + +Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mem0mem.h +The memory management + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +#ifndef mem0mem_h +#define mem0mem_h + +#include "univ.i" +#include "ut0mem.h" +#include "ut0byte.h" +#include "ut0rnd.h" +#ifndef UNIV_HOTBACKUP +# include "sync0sync.h" +#endif /* UNIV_HOTBACKUP */ +#include "ut0lst.h" +#include "mach0data.h" + +/* -------------------- MEMORY HEAPS ----------------------------- */ + +/* A block of a memory heap consists of the info structure +followed by an area of memory */ +typedef struct mem_block_info_t mem_block_t; + +/* A memory heap is a nonempty linear list of memory blocks */ +typedef mem_block_t mem_heap_t; + +/* Types of allocation for memory heaps: DYNAMIC means allocation from the +dynamic memory pool of the C compiler, BUFFER means allocation from the +buffer pool; the latter method is used for very big heaps */ + +#define MEM_HEAP_DYNAMIC 0 /* the most common type */ +#define MEM_HEAP_BUFFER 1 +#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be + ORed to MEM_HEAP_BUFFER, in which + case heap->free_block is used in + some cases for memory allocations, + and if it's NULL, the memory + allocation functions can return + NULL. */ + +/* Different type of heaps in terms of which datastructure is using them */ +#define MEM_HEAP_FOR_BTR_SEARCH (MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER) +#define MEM_HEAP_FOR_PAGE_HASH (MEM_HEAP_DYNAMIC) +#define MEM_HEAP_FOR_RECV_SYS (MEM_HEAP_BUFFER) +#define MEM_HEAP_FOR_LOCK_HEAP (MEM_HEAP_BUFFER) + +/* The following start size is used for the first block in the memory heap if +the size is not specified, i.e., 0 is given as the parameter in the call of +create. The standard size is the maximum (payload) size of the blocks used for +allocations of small buffers. */ + +#define MEM_BLOCK_START_SIZE 64 +#define MEM_BLOCK_STANDARD_SIZE \ + (UNIV_PAGE_SIZE >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF) + +/* If a memory heap is allowed to grow into the buffer pool, the following +is the maximum size for a single allocated buffer: */ +#define MEM_MAX_ALLOC_IN_BUF (UNIV_PAGE_SIZE - 200) + +/******************************************************************//** +Initializes the memory system. */ +UNIV_INTERN +void +mem_init( +/*=====*/ + ulint size); /*!< in: common pool size in bytes */ +/******************************************************************//** +Closes the memory system. */ +UNIV_INTERN +void +mem_close(void); +/*===========*/ + +#ifdef UNIV_DEBUG +/**************************************************************//** +Use this macro instead of the corresponding function! Macro for memory +heap creation. */ + +# define mem_heap_create(N) mem_heap_create_func( \ + (N), __FILE__, __LINE__, MEM_HEAP_DYNAMIC) +/**************************************************************//** +Use this macro instead of the corresponding function! Macro for memory +heap creation. */ + +# define mem_heap_create_typed(N, T) mem_heap_create_func( \ + (N), __FILE__, __LINE__, (T)) + +#else /* UNIV_DEBUG */ +/**************************************************************//** +Use this macro instead of the corresponding function! Macro for memory +heap creation. */ + +# define mem_heap_create(N) mem_heap_create_func( \ + (N), MEM_HEAP_DYNAMIC) +/**************************************************************//** +Use this macro instead of the corresponding function! Macro for memory +heap creation. */ + +# define mem_heap_create_typed(N, T) mem_heap_create_func( \ + (N), (T)) + +#endif /* UNIV_DEBUG */ +/**************************************************************//** +Use this macro instead of the corresponding function! Macro for memory +heap freeing. */ + +#define mem_heap_free(heap) mem_heap_free_func(\ + (heap), __FILE__, __LINE__) +/*****************************************************************//** +NOTE: Use the corresponding macros instead of this function. Creates a +memory heap. For debugging purposes, takes also the file name and line as +arguments. +@return own: memory heap, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( +/*=================*/ + ulint n, /*!< in: desired start block size, + this means that a single user buffer + of size n will fit in the block, + 0 creates a default size block */ +#ifdef UNIV_DEBUG + const char* file_name, /*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint type); /*!< in: heap type */ +/*****************************************************************//** +NOTE: Use the corresponding macro instead of this function. Frees the space +occupied by a memory heap. In the debug version erases the heap memory +blocks. */ +UNIV_INLINE +void +mem_heap_free_func( +/*===============*/ + mem_heap_t* heap, /*!< in, own: heap to be freed */ + const char* file_name, /*!< in: file name where freed */ + ulint line); /*!< in: line where freed */ +/***************************************************************//** +Allocates and zero-fills n bytes of memory from a memory heap. +@return allocated, zero-filled storage */ +UNIV_INLINE +void* +mem_heap_zalloc( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +/***************************************************************//** +Allocates n bytes of memory from a memory heap. +@return allocated storage, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +void* +mem_heap_alloc( +/*===========*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +/*****************************************************************//** +Returns a pointer to the heap top. +@return pointer to the heap top */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( +/*==================*/ + mem_heap_t* heap); /*!< in: memory heap */ +/*****************************************************************//** +Frees the space in a memory heap exceeding the pointer given. The +pointer must have been acquired from mem_heap_get_heap_top. The first +memory block of the heap is not freed. */ +UNIV_INLINE +void +mem_heap_free_heap_top( +/*===================*/ + mem_heap_t* heap, /*!< in: heap from which to free */ + byte* old_top);/*!< in: pointer to old top of heap */ +/*****************************************************************//** +Empties a memory heap. The first memory block of the heap is not freed. */ +UNIV_INLINE +void +mem_heap_empty( +/*===========*/ + mem_heap_t* heap); /*!< in: heap to empty */ +/*****************************************************************//** +Returns a pointer to the topmost element in a memory heap. +The size of the element must be given. +@return pointer to the topmost element */ +UNIV_INLINE +void* +mem_heap_get_top( +/*=============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: size of the topmost element */ +/*****************************************************************//** +Frees the topmost element in a memory heap. +The size of the element must be given. */ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: size of the topmost element */ +/*****************************************************************//** +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap); /*!< in: heap */ +/**************************************************************//** +Use this macro instead of the corresponding function! +Macro for memory buffer allocation */ + +#define mem_zalloc(N) memset(mem_alloc(N), 0, (N)) + +#ifdef UNIV_DEBUG +#define mem_alloc(N) mem_alloc_func((N), __FILE__, __LINE__, NULL) +#define mem_alloc2(N,S) mem_alloc_func((N), __FILE__, __LINE__, (S)) +#else /* UNIV_DEBUG */ +#define mem_alloc(N) mem_alloc_func((N), NULL) +#define mem_alloc2(N,S) mem_alloc_func((N), (S)) +#endif /* UNIV_DEBUG */ + +/***************************************************************//** +NOTE: Use the corresponding macro instead of this function. +Allocates a single buffer of memory from the dynamic memory of +the C compiler. Is like malloc of C. The buffer must be freed +with mem_free. +@return own: free storage */ +UNIV_INLINE +void* +mem_alloc_func( +/*===========*/ + ulint n, /*!< in: requested size in bytes */ +#ifdef UNIV_DEBUG + const char* file_name, /*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint* size); /*!< out: allocated size in bytes, + or NULL */ + +/**************************************************************//** +Use this macro instead of the corresponding function! +Macro for memory buffer freeing */ + +#define mem_free(PTR) mem_free_func((PTR), __FILE__, __LINE__) +/***************************************************************//** +NOTE: Use the corresponding macro instead of this function. +Frees a single buffer of storage from +the dynamic memory of C compiler. Similar to free of C. */ +UNIV_INLINE +void +mem_free_func( +/*==========*/ + void* ptr, /*!< in, own: buffer to be freed */ + const char* file_name, /*!< in: file name where created */ + ulint line); /*!< in: line where created */ + +/**********************************************************************//** +Duplicates a NUL-terminated string. +@return own: a copy of the string, must be deallocated with mem_free */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + const char* str); /*!< in: string to be copied */ +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string. +@return own: a copy of the string, must be deallocated with mem_free */ +UNIV_INLINE +char* +mem_strdupl( +/*========*/ + const char* str, /*!< in: string to be copied */ + ulint len); /*!< in: length of str, in bytes */ + +/**********************************************************************//** +Duplicates a NUL-terminated string, allocated from a memory heap. +@return own: a copy of the string */ +UNIV_INTERN +char* +mem_heap_strdup( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* str); /*!< in: string to be copied */ +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string, +allocated from a memory heap. +@return own: a copy of the string */ +UNIV_INLINE +char* +mem_heap_strdupl( +/*=============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* str, /*!< in: string to be copied */ + ulint len); /*!< in: length of str, in bytes */ + +/**********************************************************************//** +Concatenate two strings and return the result, using a memory heap. +@return own: the result */ +UNIV_INTERN +char* +mem_heap_strcat( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* s1, /*!< in: string 1 */ + const char* s2); /*!< in: string 2 */ + +/**********************************************************************//** +Duplicate a block of data, allocated from a memory heap. +@return own: a copy of the data */ +UNIV_INTERN +void* +mem_heap_dup( +/*=========*/ + mem_heap_t* heap, /*!< in: memory heap where copy is allocated */ + const void* data, /*!< in: data to be copied */ + ulint len); /*!< in: length of data, in bytes */ + +/****************************************************************//** +A simple sprintf replacement that dynamically allocates the space for the +formatted string from the given heap. This supports a very limited set of +the printf syntax: types 's' and 'u' and length modifier 'l' (which is +required for the 'u' type). +@return heap-allocated formatted string */ +UNIV_INTERN +char* +mem_heap_printf( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap */ + const char* format, /*!< in: format string */ + ...) __attribute__ ((format (printf, 2, 3))); + +#ifdef MEM_PERIODIC_CHECK +/******************************************************************//** +Goes through the list of all allocated mem blocks, checks their magic +numbers, and reports possible corruption. */ +UNIV_INTERN +void +mem_validate_all_blocks(void); +/*=========================*/ +#endif + +/*#######################################################################*/ + +/** The info structure stored at the beginning of a heap block */ +struct mem_block_info_t { + ulint magic_n;/* magic number for debugging */ +#ifdef UNIV_DEBUG + char file_name[8];/* file name where the mem heap was created */ + ulint line; /*!< line number where the mem heap was created */ +#endif /* UNIV_DEBUG */ + UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in the + the list this is the base node of the list of blocks; + in subsequent blocks this is undefined */ + UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next + and prev in the list. The first block allocated + to the heap is also the first block in this list, + though it also contains the base node of the list. */ + ulint len; /*!< physical length of this block in bytes */ + ulint total_size; /*!< physical length in bytes of all blocks + in the heap. This is defined only in the base + node and is set to ULINT_UNDEFINED in others. */ + ulint type; /*!< type of heap: MEM_HEAP_DYNAMIC, or + MEM_HEAP_BUF possibly ORed to MEM_HEAP_BTR_SEARCH */ + ulint free; /*!< offset in bytes of the first free position for + user data in the block */ + ulint start; /*!< the value of the struct field 'free' at the + creation of the block */ +#ifndef UNIV_HOTBACKUP + void* free_block; + /* if the MEM_HEAP_BTR_SEARCH bit is set in type, + and this is the heap root, this can contain an + allocated buffer frame, which can be appended as a + free block to the heap, if we need more space; + otherwise, this is NULL */ + void* buf_block; + /* if this block has been allocated from the buffer + pool, this contains the buf_block_t handle; + otherwise, this is NULL */ +#endif /* !UNIV_HOTBACKUP */ +#ifdef MEM_PERIODIC_CHECK + UT_LIST_NODE_T(mem_block_t) mem_block_list; + /* List of all mem blocks allocated; protected + by the mem_comm_pool mutex */ +#endif +}; + +#define MEM_BLOCK_MAGIC_N 764741555 +#define MEM_FREED_BLOCK_MAGIC_N 547711122 + +/* Header size for a memory heap block */ +#define MEM_BLOCK_HEADER_SIZE ut_calc_align(sizeof(mem_block_info_t),\ + UNIV_MEM_ALIGNMENT) +#include "mem0dbg.h" + +#ifndef UNIV_NONINL +#include "mem0mem.ic" +#endif + +#endif diff --git a/storage/innobase/include/mem0mem.ic b/storage/innobase/include/mem0mem.ic new file mode 100644 index 00000000000..0d983d69e1a --- /dev/null +++ b/storage/innobase/include/mem0mem.ic @@ -0,0 +1,649 @@ +/***************************************************************************** + +Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/mem0mem.ic +The memory management + +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ + +#include "mem0dbg.ic" +#ifndef UNIV_HOTBACKUP +# include "mem0pool.h" +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +# define mem_heap_create_block(heap, n, type, file_name, line) \ + mem_heap_create_block_func(heap, n, file_name, line, type) +# define mem_heap_create_at(N, file_name, line) \ + mem_heap_create_func(N, file_name, line, MEM_HEAP_DYNAMIC) +#else /* UNIV_DEBUG */ +# define mem_heap_create_block(heap, n, type, file_name, line) \ + mem_heap_create_block_func(heap, n, type) +# define mem_heap_create_at(N, file_name, line) \ + mem_heap_create_func(N, MEM_HEAP_DYNAMIC) +#endif /* UNIV_DEBUG */ +/***************************************************************//** +Creates a memory heap block where data can be allocated. +@return own: memory heap block, NULL if did not succeed (only possible +for MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INTERN +mem_block_t* +mem_heap_create_block_func( +/*=======================*/ + mem_heap_t* heap, /*!< in: memory heap or NULL if first block + should be created */ + ulint n, /*!< in: number of bytes needed for user data */ +#ifdef UNIV_DEBUG + const char* file_name,/*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint type); /*!< in: type of heap: MEM_HEAP_DYNAMIC or + MEM_HEAP_BUFFER */ +/******************************************************************//** +Frees a block from a memory heap. */ +UNIV_INTERN +void +mem_heap_block_free( +/*================*/ + mem_heap_t* heap, /*!< in: heap */ + mem_block_t* block); /*!< in: block to free */ +#ifndef UNIV_HOTBACKUP +/******************************************************************//** +Frees the free_block field from a memory heap. */ +UNIV_INTERN +void +mem_heap_free_block_free( +/*=====================*/ + mem_heap_t* heap); /*!< in: heap */ +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************//** +Adds a new block to a memory heap. +@return created block, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INTERN +mem_block_t* +mem_heap_add_block( +/*===============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: number of bytes user needs */ + +UNIV_INLINE +void +mem_block_set_len(mem_block_t* block, ulint len) +{ + ut_ad(len > 0); + + block->len = len; +} + +UNIV_INLINE +ulint +mem_block_get_len(mem_block_t* block) +{ + return(block->len); +} + +UNIV_INLINE +void +mem_block_set_type(mem_block_t* block, ulint type) +{ + ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER) + || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH)); + + block->type = type; +} + +UNIV_INLINE +ulint +mem_block_get_type(mem_block_t* block) +{ + return(block->type); +} + +UNIV_INLINE +void +mem_block_set_free(mem_block_t* block, ulint free) +{ + ut_ad(free > 0); + ut_ad(free <= mem_block_get_len(block)); + + block->free = free; +} + +UNIV_INLINE +ulint +mem_block_get_free(mem_block_t* block) +{ + return(block->free); +} + +UNIV_INLINE +void +mem_block_set_start(mem_block_t* block, ulint start) +{ + ut_ad(start > 0); + + block->start = start; +} + +UNIV_INLINE +ulint +mem_block_get_start(mem_block_t* block) +{ + return(block->start); +} + +/***************************************************************//** +Allocates and zero-fills n bytes of memory from a memory heap. +@return allocated, zero-filled storage */ +UNIV_INLINE +void* +mem_heap_zalloc( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +{ + ut_ad(heap); + ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH)); + return(memset(mem_heap_alloc(heap, n), 0, n)); +} + +/***************************************************************//** +Allocates n bytes of memory from a memory heap. +@return allocated storage, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +void* +mem_heap_alloc( +/*===========*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +{ + mem_block_t* block; + void* buf; + ulint free; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF)); + + /* Check if there is enough space in block. If not, create a new + block to the heap */ + + if (mem_block_get_len(block) + < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) { + + block = mem_heap_add_block(heap, n); + + if (block == NULL) { + + return(NULL); + } + } + + free = mem_block_get_free(block); + + buf = (byte*) block + free; + + mem_block_set_free(block, free + MEM_SPACE_NEEDED(n)); + +#ifdef UNIV_MEM_DEBUG + UNIV_MEM_ALLOC(buf, + n + MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE); + + /* In the debug version write debugging info to the field */ + mem_field_init((byte*) buf, n); + + /* Advance buf to point at the storage which will be given to the + caller */ + buf = (byte*) buf + MEM_FIELD_HEADER_SIZE; + +#endif + UNIV_MEM_ALLOC(buf, n); + return(buf); +} + +/*****************************************************************//** +Returns a pointer to the heap top. +@return pointer to the heap top */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( +/*==================*/ + mem_heap_t* heap) /*!< in: memory heap */ +{ + mem_block_t* block; + byte* buf; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*) block + mem_block_get_free(block); + + return(buf); +} + +/*****************************************************************//** +Frees the space in a memory heap exceeding the pointer given. The +pointer must have been acquired from mem_heap_get_heap_top. The first +memory block of the heap is not freed. */ +UNIV_INLINE +void +mem_heap_free_heap_top( +/*===================*/ + mem_heap_t* heap, /*!< in: heap from which to free */ + byte* old_top)/*!< in: pointer to old top of heap */ +{ + mem_block_t* block; + mem_block_t* prev_block; +#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG + ibool error; + ulint total_size; + ulint size; + + ut_ad(mem_heap_check(heap)); + + /* Validate the heap and get its total allocated size */ + mem_heap_validate_or_print(heap, NULL, FALSE, &error, &total_size, + NULL, NULL); + ut_a(!error); + + /* Get the size below top pointer */ + mem_heap_validate_or_print(heap, old_top, FALSE, &error, &size, NULL, + NULL); + ut_a(!error); + +#endif + + block = UT_LIST_GET_LAST(heap->base); + + while (block != NULL) { + if (((byte*) block + mem_block_get_free(block) >= old_top) + && ((byte*) block <= old_top)) { + /* Found the right block */ + + break; + } + + /* Store prev_block value before freeing the current block + (the current block will be erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } + + ut_ad(block); + + /* Set the free field of block */ + mem_block_set_free(block, old_top - (byte*) block); + + ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); + UNIV_MEM_ASSERT_W(old_top, (byte*) block + block->len - old_top); +#if defined UNIV_MEM_DEBUG + /* In the debug version erase block from top up */ + mem_erase_buf(old_top, (byte*) block + block->len - old_top); + + /* Update allocated memory count */ + mutex_enter(&mem_hash_mutex); + mem_current_allocated_memory -= (total_size - size); + mutex_exit(&mem_hash_mutex); +#endif /* UNIV_MEM_DEBUG */ + UNIV_MEM_ALLOC(old_top, (byte*) block + block->len - old_top); + + /* If free == start, we may free the block if it is not the first + one */ + + if ((heap != block) && (mem_block_get_free(block) + == mem_block_get_start(block))) { + mem_heap_block_free(heap, block); + } +} + +/*****************************************************************//** +Empties a memory heap. The first memory block of the heap is not freed. */ +UNIV_INLINE +void +mem_heap_empty( +/*===========*/ + mem_heap_t* heap) /*!< in: heap to empty */ +{ + mem_heap_free_heap_top(heap, (byte*) heap + mem_block_get_start(heap)); +#ifndef UNIV_HOTBACKUP + if (heap->free_block) { + mem_heap_free_block_free(heap); + } +#endif /* !UNIV_HOTBACKUP */ +} + +/*****************************************************************//** +Returns a pointer to the topmost element in a memory heap. The size of the +element must be given. +@return pointer to the topmost element */ +UNIV_INLINE +void* +mem_heap_get_top( +/*=============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: size of the topmost element */ +{ + mem_block_t* block; + byte* buf; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*) block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n); + +#ifdef UNIV_MEM_DEBUG + ut_ad(mem_block_get_start(block) <= (ulint) (buf - (byte*) block)); + + /* In the debug version, advance buf to point at the storage which + was given to the caller in the allocation*/ + + buf += MEM_FIELD_HEADER_SIZE; + + /* Check that the field lengths agree */ + ut_ad(n == mem_field_header_get_len(buf)); +#endif + + return((void*) buf); +} + +/*****************************************************************//** +Frees the topmost element in a memory heap. The size of the element must be +given. */ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: size of the topmost element */ +{ + mem_block_t* block; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + /* Subtract the free field of block */ + mem_block_set_free(block, mem_block_get_free(block) + - MEM_SPACE_NEEDED(n)); + UNIV_MEM_ASSERT_W((byte*) block + mem_block_get_free(block), n); +#ifdef UNIV_MEM_DEBUG + + ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); + + /* In the debug version check the consistency, and erase field */ + mem_field_erase((byte*) block + mem_block_get_free(block), n); +#endif + + /* If free == start, we may free the block if it is not the first + one */ + + if ((heap != block) && (mem_block_get_free(block) + == mem_block_get_start(block))) { + mem_heap_block_free(heap, block); + } else { + /* Avoid a bogus UNIV_MEM_ASSERT_W() warning in a + subsequent invocation of mem_heap_free_top(). + Originally, this was UNIV_MEM_FREE(), to catch writes + to freed memory. */ + UNIV_MEM_ALLOC((byte*) block + mem_block_get_free(block), n); + } +} + +/*****************************************************************//** +NOTE: Use the corresponding macros instead of this function. Creates a +memory heap. For debugging purposes, takes also the file name and line as +argument. +@return own: memory heap, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( +/*=================*/ + ulint n, /*!< in: desired start block size, + this means that a single user buffer + of size n will fit in the block, + 0 creates a default size block */ +#ifdef UNIV_DEBUG + const char* file_name, /*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint type) /*!< in: heap type */ +{ + mem_block_t* block; + + if (!n) { + n = MEM_BLOCK_START_SIZE; + } + + block = mem_heap_create_block(NULL, n, type, file_name, line); + + if (block == NULL) { + + return(NULL); + } + + UT_LIST_INIT(block->base); + + /* Add the created block itself as the first block in the list */ + UT_LIST_ADD_FIRST(list, block->base, block); + +#ifdef UNIV_MEM_DEBUG + + mem_hash_insert(block, file_name, line); + +#endif + + return(block); +} + +/*****************************************************************//** +NOTE: Use the corresponding macro instead of this function. Frees the space +occupied by a memory heap. In the debug version erases the heap memory +blocks. */ +UNIV_INLINE +void +mem_heap_free_func( +/*===============*/ + mem_heap_t* heap, /*!< in, own: heap to be freed */ + const char* file_name __attribute__((unused)), + /*!< in: file name where freed */ + ulint line __attribute__((unused))) +{ + mem_block_t* block; + mem_block_t* prev_block; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + +#ifdef UNIV_MEM_DEBUG + + /* In the debug version remove the heap from the hash table of heaps + and check its consistency */ + + mem_hash_remove(heap, file_name, line); + +#endif +#ifndef UNIV_HOTBACKUP + if (heap->free_block) { + mem_heap_free_block_free(heap); + } +#endif /* !UNIV_HOTBACKUP */ + + while (block != NULL) { + /* Store the contents of info before freeing current block + (it is erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } +} + +/***************************************************************//** +NOTE: Use the corresponding macro instead of this function. +Allocates a single buffer of memory from the dynamic memory of +the C compiler. Is like malloc of C. The buffer must be freed +with mem_free. +@return own: free storage */ +UNIV_INLINE +void* +mem_alloc_func( +/*===========*/ + ulint n, /*!< in: desired number of bytes */ +#ifdef UNIV_DEBUG + const char* file_name, /*!< in: file name where created */ + ulint line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint* size) /*!< out: allocated size in bytes, + or NULL */ +{ + mem_heap_t* heap; + void* buf; + + heap = mem_heap_create_at(n, file_name, line); + + /* Note that as we created the first block in the heap big enough + for the buffer requested by the caller, the buffer will be in the + first block and thus we can calculate the pointer to the heap from + the pointer to the buffer when we free the memory buffer. */ + + if (size) { + /* Adjust the allocation to the actual size of the + memory block. */ + ulint m = mem_block_get_len(heap) + - mem_block_get_free(heap); +#ifdef UNIV_MEM_DEBUG + m -= MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE; +#endif /* UNIV_MEM_DEBUG */ + ut_ad(m >= n); + n = m; + *size = m; + } + + buf = mem_heap_alloc(heap, n); + + ut_a((byte*) heap == (byte*) buf - MEM_BLOCK_HEADER_SIZE + - MEM_FIELD_HEADER_SIZE); + return(buf); +} + +/***************************************************************//** +NOTE: Use the corresponding macro instead of this function. Frees a single +buffer of storage from the dynamic memory of the C compiler. Similar to the +free of C. */ +UNIV_INLINE +void +mem_free_func( +/*==========*/ + void* ptr, /*!< in, own: buffer to be freed */ + const char* file_name, /*!< in: file name where created */ + ulint line) /*!< in: line where created */ +{ + mem_heap_t* heap; + + heap = (mem_heap_t*)((byte*) ptr - MEM_BLOCK_HEADER_SIZE + - MEM_FIELD_HEADER_SIZE); + mem_heap_free_func(heap, file_name, line); +} + +/*****************************************************************//** +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint size = 0; + + ut_ad(mem_heap_check(heap)); + + size = heap->total_size; + +#ifndef UNIV_HOTBACKUP + if (heap->free_block) { + size += UNIV_PAGE_SIZE; + } +#endif /* !UNIV_HOTBACKUP */ + + return(size); +} + +/**********************************************************************//** +Duplicates a NUL-terminated string. +@return own: a copy of the string, must be deallocated with mem_free */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + const char* str) /*!< in: string to be copied */ +{ + ulint len = strlen(str) + 1; + return((char*) memcpy(mem_alloc(len), str, len)); +} + +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string. +@return own: a copy of the string, must be deallocated with mem_free */ +UNIV_INLINE +char* +mem_strdupl( +/*========*/ + const char* str, /*!< in: string to be copied */ + ulint len) /*!< in: length of str, in bytes */ +{ + char* s = (char*) mem_alloc(len + 1); + s[len] = 0; + return((char*) memcpy(s, str, len)); +} + +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string, +allocated from a memory heap. +@return own: a copy of the string */ +UNIV_INLINE +char* +mem_heap_strdupl( +/*=============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* str, /*!< in: string to be copied */ + ulint len) /*!< in: length of str, in bytes */ +{ + char* s = (char*) mem_heap_alloc(heap, len + 1); + s[len] = 0; + return((char*) memcpy(s, str, len)); +} diff --git a/storage/innobase/include/mem0pool.h b/storage/innobase/include/mem0pool.h new file mode 100644 index 00000000000..a65ba50fdf9 --- /dev/null +++ b/storage/innobase/include/mem0pool.h @@ -0,0 +1,121 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mem0pool.h +The lowest-level memory management + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +#ifndef mem0pool_h +#define mem0pool_h + +#include "univ.i" +#include "os0file.h" +#include "ut0lst.h" + +/** Memory pool */ +struct mem_pool_t; + +/** The common memory pool */ +extern mem_pool_t* mem_comm_pool; + +/** Memory area header */ +struct mem_area_t{ + ulint size_and_free; /*!< memory area size is obtained by + anding with ~MEM_AREA_FREE; area in + a free list if ANDing with + MEM_AREA_FREE results in nonzero */ + UT_LIST_NODE_T(mem_area_t) + free_list; /*!< free list node */ +}; + +/** Each memory area takes this many extra bytes for control information */ +#define MEM_AREA_EXTRA_SIZE (ut_calc_align(sizeof(struct mem_area_t),\ + UNIV_MEM_ALIGNMENT)) + +/********************************************************************//** +Creates a memory pool. +@return memory pool */ +UNIV_INTERN +mem_pool_t* +mem_pool_create( +/*============*/ + ulint size); /*!< in: pool size in bytes */ +/********************************************************************//** +Frees a memory pool. */ +UNIV_INTERN +void +mem_pool_free( +/*==========*/ + mem_pool_t* pool); /*!< in, own: memory pool */ +/********************************************************************//** +Allocates memory from a pool. NOTE: This low-level function should only be +used in mem0mem.*! +@return own: allocated memory buffer */ +UNIV_INTERN +void* +mem_area_alloc( +/*===========*/ + ulint* psize, /*!< in: requested size in bytes; for optimum + space usage, the size should be a power of 2 + minus MEM_AREA_EXTRA_SIZE; + out: allocated size in bytes (greater than + or equal to the requested size) */ + mem_pool_t* pool); /*!< in: memory pool */ +/********************************************************************//** +Frees memory to a pool. */ +UNIV_INTERN +void +mem_area_free( +/*==========*/ + void* ptr, /*!< in, own: pointer to allocated memory + buffer */ + mem_pool_t* pool); /*!< in: memory pool */ +/********************************************************************//** +Returns the amount of reserved memory. +@return reserved mmeory in bytes */ +UNIV_INTERN +ulint +mem_pool_get_reserved( +/*==================*/ + mem_pool_t* pool); /*!< in: memory pool */ +/********************************************************************//** +Validates a memory pool. +@return TRUE if ok */ +UNIV_INTERN +ibool +mem_pool_validate( +/*==============*/ + mem_pool_t* pool); /*!< in: memory pool */ +/********************************************************************//** +Prints info of a memory pool. */ +UNIV_INTERN +void +mem_pool_print_info( +/*================*/ + FILE* outfile,/*!< in: output file to write to */ + mem_pool_t* pool); /*!< in: memory pool */ + + +#ifndef UNIV_NONINL +#include "mem0pool.ic" +#endif + +#endif diff --git a/storage/innobase/include/mem0pool.ic b/storage/innobase/include/mem0pool.ic new file mode 100644 index 00000000000..f4bafb8ba63 --- /dev/null +++ b/storage/innobase/include/mem0pool.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/mem0pool.ic +The lowest-level memory management + +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h new file mode 100644 index 00000000000..18a345d050f --- /dev/null +++ b/storage/innobase/include/mtr0log.h @@ -0,0 +1,251 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0log.h +Mini-transaction logging routines + +Created 12/7/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0log_h +#define mtr0log_h + +#include "univ.i" +#include "mtr0mtr.h" +#include "dict0types.h" + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log +record to the mini-transaction log if mtr is not NULL. */ +UNIV_INTERN +void +mlog_write_ulint( +/*=============*/ + byte* ptr, /*!< in: pointer where to write */ + ulint val, /*!< in: value to write */ + byte type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Writes 8 bytes to a file page. Writes the corresponding log +record to the mini-transaction log, only if mtr is not NULL */ +UNIV_INTERN +void +mlog_write_ull( +/*===========*/ + byte* ptr, /*!< in: pointer where to write */ + ib_uint64_t val, /*!< in: value to write */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Writes a string to a file page buffered in the buffer pool. Writes the +corresponding log record to the mini-transaction log. */ +UNIV_INTERN +void +mlog_write_string( +/*==============*/ + byte* ptr, /*!< in: pointer where to write */ + const byte* str, /*!< in: string to write */ + ulint len, /*!< in: string length */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Logs a write of a string to a file page buffered in the buffer pool. +Writes the corresponding log record to the mini-transaction log. */ +UNIV_INTERN +void +mlog_log_string( +/*============*/ + byte* ptr, /*!< in: pointer written to */ + ulint len, /*!< in: string length */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Writes initial part of a log record consisting of one-byte item +type and four-byte space and page numbers. */ +UNIV_INTERN +void +mlog_write_initial_log_record( +/*==========================*/ + const byte* ptr, /*!< in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /*!< in: log item type: MLOG_1BYTE, ... */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +/********************************************************//** +Writes a log record about an .ibd file create/delete/rename. +@return new value of log_ptr */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_for_file_op( +/*======================================*/ + ulint type, /*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id,/*!< in: space id, if applicable */ + ulint page_no,/*!< in: page number (not relevant currently) */ + byte* log_ptr,/*!< in: pointer to mtr log which has been opened */ + mtr_t* mtr); /*!< in: mtr */ +/********************************************************//** +Catenates 1 - 4 bytes to the mtr log. */ +UNIV_INLINE +void +mlog_catenate_ulint( +/*================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint val, /*!< in: value to write */ + ulint type); /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ +/********************************************************//** +Catenates n bytes to the mtr log. */ +UNIV_INTERN +void +mlog_catenate_string( +/*=================*/ + mtr_t* mtr, /*!< in: mtr */ + const byte* str, /*!< in: string to write */ + ulint len); /*!< in: string length */ +/********************************************************//** +Catenates a compressed ulint to mlog. */ +UNIV_INLINE +void +mlog_catenate_ulint_compressed( +/*===========================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint val); /*!< in: value to write */ +/********************************************************//** +Catenates a compressed 64-bit integer to mlog. */ +UNIV_INLINE +void +mlog_catenate_ull_compressed( +/*=========================*/ + mtr_t* mtr, /*!< in: mtr */ + ib_uint64_t val); /*!< in: value to write */ +/********************************************************//** +Opens a buffer to mlog. It must be closed with mlog_close. +@return buffer, NULL if log mode MTR_LOG_NONE */ +UNIV_INLINE +byte* +mlog_open( +/*======*/ + mtr_t* mtr, /*!< in: mtr */ + ulint size); /*!< in: buffer size in bytes; MUST be + smaller than DYN_ARRAY_DATA_SIZE! */ +/********************************************************//** +Closes a buffer opened to mlog. */ +UNIV_INLINE +void +mlog_close( +/*=======*/ + mtr_t* mtr, /*!< in: mtr */ + byte* ptr); /*!< in: buffer space from ptr up was not used */ +/********************************************************//** +Writes the initial part of a log record (3..11 bytes). +If the implementation of this function is changed, all +size parameters to mlog_open() should be adjusted accordingly! +@return new value of log_ptr */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_fast( +/*===============================*/ + const byte* ptr, /*!< in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /*!< in: log item type: MLOG_1BYTE, ... */ + byte* log_ptr,/*!< in: pointer to mtr log which has + been opened */ + mtr_t* mtr); /*!< in: mtr */ +#else /* !UNIV_HOTBACKUP */ +# define mlog_write_initial_log_record(ptr,type,mtr) ((void) 0) +# define mlog_write_initial_log_record_fast(ptr,type,log_ptr,mtr) ((byte*) 0) +#endif /* !UNIV_HOTBACKUP */ +/********************************************************//** +Parses an initial log record written by mlog_write_initial_log_record. +@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_initial_log_record( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + byte* type, /*!< out: log record type: MLOG_1BYTE, ... */ + ulint* space, /*!< out: space id */ + ulint* page_no);/*!< out: page number */ +/********************************************************//** +Parses a log record written by mlog_write_ulint or mlog_write_ull. +@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_nbytes( +/*==============*/ + ulint type, /*!< in: log record type: MLOG_1BYTE, ... */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + byte* page, /*!< in: page where to apply the log record, or NULL */ + void* page_zip);/*!< in/out: compressed page, or NULL */ +/********************************************************//** +Parses a log record written by mlog_write_string. +@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_string( +/*==============*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + byte* page, /*!< in: page where to apply the log record, or NULL */ + void* page_zip);/*!< in/out: compressed page, or NULL */ + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Opens a buffer for mlog, writes the initial log record and, +if needed, the field lengths of an index. Reserves space +for further log entries. The log entry must be closed with +mtr_close(). +@return buffer, NULL if log mode MTR_LOG_NONE */ +UNIV_INTERN +byte* +mlog_open_and_write_index( +/*======================*/ + mtr_t* mtr, /*!< in: mtr */ + const byte* rec, /*!< in: index record or page */ + const dict_index_t* index, /*!< in: record descriptor */ + byte type, /*!< in: log item type */ + ulint size); /*!< in: requested buffer size in bytes + (if 0, calls mlog_close() and + returns NULL) */ +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************//** +Parses a log record written by mlog_open_and_write_index. +@return parsed record end, NULL if not a complete record */ +UNIV_INTERN +byte* +mlog_parse_index( +/*=============*/ + byte* ptr, /*!< in: buffer */ + const byte* end_ptr,/*!< in: buffer end */ + ibool comp, /*!< in: TRUE=compact record format */ + dict_index_t** index); /*!< out, own: dummy index */ + +#ifndef UNIV_HOTBACKUP +/* Insert, update, and maybe other functions may use this value to define an +extra mlog buffer size for variable size data */ +#define MLOG_BUF_MARGIN 256 +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_NONINL +#include "mtr0log.ic" +#endif + +#endif diff --git a/storage/innobase/include/mtr0log.ic b/storage/innobase/include/mtr0log.ic new file mode 100644 index 00000000000..3ed4876eeab --- /dev/null +++ b/storage/innobase/include/mtr0log.ic @@ -0,0 +1,276 @@ +/***************************************************************************** + +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0log.ic +Mini-transaction logging routines + +Created 12/7/1995 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "ut0lst.h" +#include "buf0buf.h" +#include "buf0dblwr.h" +#include "fsp0types.h" +#include "trx0sys.h" + +/********************************************************//** +Opens a buffer to mlog. It must be closed with mlog_close. +@return buffer, NULL if log mode MTR_LOG_NONE */ +UNIV_INLINE +byte* +mlog_open( +/*======*/ + mtr_t* mtr, /*!< in: mtr */ + ulint size) /*!< in: buffer size in bytes; MUST be + smaller than DYN_ARRAY_DATA_SIZE! */ +{ + dyn_array_t* mlog; + + mtr->modifications = TRUE; + + if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) { + + return(NULL); + } + + mlog = &(mtr->log); + + return(dyn_array_open(mlog, size)); +} + +/********************************************************//** +Closes a buffer opened to mlog. */ +UNIV_INLINE +void +mlog_close( +/*=======*/ + mtr_t* mtr, /*!< in: mtr */ + byte* ptr) /*!< in: buffer space from ptr up was not used */ +{ + dyn_array_t* mlog; + + ut_ad(mtr_get_log_mode(mtr) != MTR_LOG_NONE); + + mlog = &(mtr->log); + + dyn_array_close(mlog, ptr); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Catenates 1 - 4 bytes to the mtr log. The value is not compressed. */ +UNIV_INLINE +void +mlog_catenate_ulint( +/*================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint val, /*!< in: value to write */ + ulint type) /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ +{ + dyn_array_t* mlog; + byte* ptr; + + if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) { + + return; + } + + mlog = &(mtr->log); + +#if MLOG_1BYTE != 1 +# error "MLOG_1BYTE != 1" +#endif +#if MLOG_2BYTES != 2 +# error "MLOG_2BYTES != 2" +#endif +#if MLOG_4BYTES != 4 +# error "MLOG_4BYTES != 4" +#endif +#if MLOG_8BYTES != 8 +# error "MLOG_8BYTES != 8" +#endif + ptr = (byte*) dyn_array_push(mlog, type); + + if (type == MLOG_4BYTES) { + mach_write_to_4(ptr, val); + } else if (type == MLOG_2BYTES) { + mach_write_to_2(ptr, val); + } else { + ut_ad(type == MLOG_1BYTE); + mach_write_to_1(ptr, val); + } +} + +/********************************************************//** +Catenates a compressed ulint to mlog. */ +UNIV_INLINE +void +mlog_catenate_ulint_compressed( +/*===========================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint val) /*!< in: value to write */ +{ + byte* log_ptr; + + log_ptr = mlog_open(mtr, 10); + + /* If no logging is requested, we may return now */ + if (log_ptr == NULL) { + + return; + } + + log_ptr += mach_write_compressed(log_ptr, val); + + mlog_close(mtr, log_ptr); +} + +/********************************************************//** +Catenates a compressed 64-bit integer to mlog. */ +UNIV_INLINE +void +mlog_catenate_ull_compressed( +/*=========================*/ + mtr_t* mtr, /*!< in: mtr */ + ib_uint64_t val) /*!< in: value to write */ +{ + byte* log_ptr; + + log_ptr = mlog_open(mtr, 15); + + /* If no logging is requested, we may return now */ + if (log_ptr == NULL) { + + return; + } + + log_ptr += mach_ull_write_compressed(log_ptr, val); + + mlog_close(mtr, log_ptr); +} + +/********************************************************//** +Writes the initial part of a log record (3..11 bytes). +If the implementation of this function is changed, all +size parameters to mlog_open() should be adjusted accordingly! +@return new value of log_ptr */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_fast( +/*===============================*/ + const byte* ptr, /*!< in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /*!< in: log item type: MLOG_1BYTE, ... */ + byte* log_ptr,/*!< in: pointer to mtr log which has + been opened */ + mtr_t* mtr) /*!< in: mtr */ +{ +#ifdef UNIV_DEBUG + buf_block_t* block; +#endif + const byte* page; + ulint space; + ulint offset; + + ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX)); + ut_ad(type <= MLOG_BIGGEST_TYPE); + ut_ad(ptr && log_ptr); + + page = (const byte*) ut_align_down(ptr, UNIV_PAGE_SIZE); + space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + offset = mach_read_from_4(page + FIL_PAGE_OFFSET); + + /* check whether the page is in the doublewrite buffer; + the doublewrite buffer is located in pages + FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the + system tablespace */ + if (space == TRX_SYS_SPACE + && offset >= FSP_EXTENT_SIZE && offset < 3 * FSP_EXTENT_SIZE) { + if (buf_dblwr_being_created) { + /* Do nothing: we only come to this branch in an + InnoDB database creation. We do not redo log + anything for the doublewrite buffer pages. */ + return(log_ptr); + } else { + fprintf(stderr, + "Error: trying to redo log a record of type " + "%d on page %lu of space %lu in the " + "doublewrite buffer, continuing anyway.\n" + "Please post a bug report to " + "bugs.mysql.com.\n", + type, offset, space); + ut_ad(0); + } + } + + mach_write_to_1(log_ptr, type); + log_ptr++; + log_ptr += mach_write_compressed(log_ptr, space); + log_ptr += mach_write_compressed(log_ptr, offset); + + mtr->n_log_recs++; + +#ifdef UNIV_LOG_DEBUG + fprintf(stderr, + "Adding to mtr log record type %lu space %lu page no %lu\n", + (ulong) type, space, offset); +#endif + +#ifdef UNIV_DEBUG + /* We now assume that all x-latched pages have been modified! */ + block = (buf_block_t*) buf_block_align(ptr); + + if (!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY)) { + + mtr_memo_push(mtr, block, MTR_MEMO_MODIFY); + } +#endif + return(log_ptr); +} + +/********************************************************//** +Writes a log record about an .ibd file create/delete/rename. +@return new value of log_ptr */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_for_file_op( +/*======================================*/ + ulint type, /*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id,/*!< in: space id, if applicable */ + ulint page_no,/*!< in: page number (not relevant currently) */ + byte* log_ptr,/*!< in: pointer to mtr log which has been opened */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(log_ptr); + + mach_write_to_1(log_ptr, type); + log_ptr++; + + /* We write dummy space id and page number */ + log_ptr += mach_write_compressed(log_ptr, space_id); + log_ptr += mach_write_compressed(log_ptr, page_no); + + mtr->n_log_recs++; + + return(log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h new file mode 100644 index 00000000000..ed7fd76d425 --- /dev/null +++ b/storage/innobase/include/mtr0mtr.h @@ -0,0 +1,420 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0mtr.h +Mini-transaction buffer + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0mtr_h +#define mtr0mtr_h + +#include "univ.i" +#include "mem0mem.h" +#include "dyn0dyn.h" +#include "buf0types.h" +#include "sync0rw.h" +#include "ut0byte.h" +#include "mtr0types.h" +#include "page0types.h" + +/* Logging modes for a mini-transaction */ +#define MTR_LOG_ALL 21 /* default mode: log all operations + modifying disk-based data */ +#define MTR_LOG_NONE 22 /* log no operations */ +#define MTR_LOG_NO_REDO 23 /* Don't generate REDO */ +/*#define MTR_LOG_SPACE 23 */ /* log only operations modifying + file space page allocation data + (operations in fsp0fsp.* ) */ +#define MTR_LOG_SHORT_INSERTS 24 /* inserts are logged in a shorter + form */ + +/* Types for the mlock objects to store in the mtr memo; NOTE that the +first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ +#define MTR_MEMO_PAGE_S_FIX RW_S_LATCH +#define MTR_MEMO_PAGE_X_FIX RW_X_LATCH +#define MTR_MEMO_BUF_FIX RW_NO_LATCH +#ifdef UNIV_DEBUG +# define MTR_MEMO_MODIFY 54 +#endif /* UNIV_DEBUG */ +#define MTR_MEMO_S_LOCK 55 +#define MTR_MEMO_X_LOCK 56 + +/** @name Log item types +The log items are declared 'byte' so that the compiler can warn if val +and type parameters are switched in a call to mlog_write_ulint. NOTE! +For 1 - 8 bytes, the flag value must give the length also! @{ */ +#define MLOG_SINGLE_REC_FLAG 128 /*!< if the mtr contains only + one log record for one page, + i.e., write_initial_log_record + has been called only once, + this flag is ORed to the type + of that first log record */ +#define MLOG_1BYTE (1) /*!< one byte is written */ +#define MLOG_2BYTES (2) /*!< 2 bytes ... */ +#define MLOG_4BYTES (4) /*!< 4 bytes ... */ +#define MLOG_8BYTES (8) /*!< 8 bytes ... */ +#define MLOG_REC_INSERT ((byte)9) /*!< record insert */ +#define MLOG_REC_CLUST_DELETE_MARK ((byte)10) /*!< mark clustered index record + deleted */ +#define MLOG_REC_SEC_DELETE_MARK ((byte)11) /*!< mark secondary index record + deleted */ +#define MLOG_REC_UPDATE_IN_PLACE ((byte)13) /*!< update of a record, + preserves record field sizes */ +#define MLOG_REC_DELETE ((byte)14) /*!< delete a record from a + page */ +#define MLOG_LIST_END_DELETE ((byte)15) /*!< delete record list end on + index page */ +#define MLOG_LIST_START_DELETE ((byte)16) /*!< delete record list start on + index page */ +#define MLOG_LIST_END_COPY_CREATED ((byte)17) /*!< copy record list end to a + new created index page */ +#define MLOG_PAGE_REORGANIZE ((byte)18) /*!< reorganize an + index page in + ROW_FORMAT=REDUNDANT */ +#define MLOG_PAGE_CREATE ((byte)19) /*!< create an index page */ +#define MLOG_UNDO_INSERT ((byte)20) /*!< insert entry in an undo + log */ +#define MLOG_UNDO_ERASE_END ((byte)21) /*!< erase an undo log + page end */ +#define MLOG_UNDO_INIT ((byte)22) /*!< initialize a page in an + undo log */ +#define MLOG_UNDO_HDR_DISCARD ((byte)23) /*!< discard an update undo log + header */ +#define MLOG_UNDO_HDR_REUSE ((byte)24) /*!< reuse an insert undo log + header */ +#define MLOG_UNDO_HDR_CREATE ((byte)25) /*!< create an undo + log header */ +#define MLOG_REC_MIN_MARK ((byte)26) /*!< mark an index + record as the + predefined minimum + record */ +#define MLOG_IBUF_BITMAP_INIT ((byte)27) /*!< initialize an + ibuf bitmap page */ +/*#define MLOG_FULL_PAGE ((byte)28) full contents of a page */ +#ifdef UNIV_LOG_LSN_DEBUG +# define MLOG_LSN ((byte)28) /* current LSN */ +#endif +#define MLOG_INIT_FILE_PAGE ((byte)29) /*!< this means that a + file page is taken + into use and the prior + contents of the page + should be ignored: in + recovery we must not + trust the lsn values + stored to the file + page */ +#define MLOG_WRITE_STRING ((byte)30) /*!< write a string to + a page */ +#define MLOG_MULTI_REC_END ((byte)31) /*!< if a single mtr writes + several log records, + this log record ends the + sequence of these records */ +#define MLOG_DUMMY_RECORD ((byte)32) /*!< dummy log record used to + pad a log block full */ +#define MLOG_FILE_CREATE ((byte)33) /*!< log record about an .ibd + file creation */ +#define MLOG_FILE_RENAME ((byte)34) /*!< log record about an .ibd + file rename */ +#define MLOG_FILE_DELETE ((byte)35) /*!< log record about an .ibd + file deletion */ +#define MLOG_COMP_REC_MIN_MARK ((byte)36) /*!< mark a compact + index record as the + predefined minimum + record */ +#define MLOG_COMP_PAGE_CREATE ((byte)37) /*!< create a compact + index page */ +#define MLOG_COMP_REC_INSERT ((byte)38) /*!< compact record insert */ +#define MLOG_COMP_REC_CLUST_DELETE_MARK ((byte)39) + /*!< mark compact + clustered index record + deleted */ +#define MLOG_COMP_REC_SEC_DELETE_MARK ((byte)40)/*!< mark compact + secondary index record + deleted; this log + record type is + redundant, as + MLOG_REC_SEC_DELETE_MARK + is independent of the + record format. */ +#define MLOG_COMP_REC_UPDATE_IN_PLACE ((byte)41)/*!< update of a + compact record, + preserves record field + sizes */ +#define MLOG_COMP_REC_DELETE ((byte)42) /*!< delete a compact record + from a page */ +#define MLOG_COMP_LIST_END_DELETE ((byte)43) /*!< delete compact record list + end on index page */ +#define MLOG_COMP_LIST_START_DELETE ((byte)44) /*!< delete compact record list + start on index page */ +#define MLOG_COMP_LIST_END_COPY_CREATED ((byte)45) + /*!< copy compact + record list end to a + new created index + page */ +#define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /*!< reorganize an index page */ +#define MLOG_FILE_CREATE2 ((byte)47) /*!< log record about creating + an .ibd file, with format */ +#define MLOG_ZIP_WRITE_NODE_PTR ((byte)48) /*!< write the node pointer of + a record on a compressed + non-leaf B-tree page */ +#define MLOG_ZIP_WRITE_BLOB_PTR ((byte)49) /*!< write the BLOB pointer + of an externally stored column + on a compressed page */ +#define MLOG_ZIP_WRITE_HEADER ((byte)50) /*!< write to compressed page + header */ +#define MLOG_ZIP_PAGE_COMPRESS ((byte)51) /*!< compress an index page */ +#define MLOG_ZIP_PAGE_COMPRESS_NO_DATA ((byte)52)/*!< compress an index page + without logging it's image */ +#define MLOG_ZIP_PAGE_REORGANIZE ((byte)53) /*!< reorganize a compressed + page */ +#define MLOG_BIGGEST_TYPE ((byte)53) /*!< biggest value (used in + assertions) */ +/* @} */ + +/** @name Flags for MLOG_FILE operations +(stored in the page number parameter, called log_flags in the +functions). The page number parameter was originally written as 0. @{ */ +#define MLOG_FILE_FLAG_TEMP 1 /*!< identifies TEMPORARY TABLE in + MLOG_FILE_CREATE, MLOG_FILE_CREATE2 */ +/* @} */ + +/* included here because it needs MLOG_LSN defined */ +#include "log0log.h" + +/***************************************************************//** +Starts a mini-transaction. */ +UNIV_INLINE +void +mtr_start( +/*======*/ + mtr_t* mtr) /*!< out: mini-transaction */ + __attribute__((nonnull)); +/***************************************************************//** +Commits a mini-transaction. */ +UNIV_INTERN +void +mtr_commit( +/*=======*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/**********************************************************//** +Sets and returns a savepoint in mtr. +@return savepoint */ +UNIV_INLINE +ulint +mtr_set_savepoint( +/*==============*/ + mtr_t* mtr); /*!< in: mtr */ +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Releases the (index tree) s-latch stored in an mtr memo after a +savepoint. */ +UNIV_INLINE +void +mtr_release_s_latch_at_savepoint( +/*=============================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint savepoint, /*!< in: savepoint */ + rw_lock_t* lock); /*!< in: latch to release */ +#else /* !UNIV_HOTBACKUP */ +# define mtr_release_s_latch_at_savepoint(mtr,savepoint,lock) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************//** +Gets the logging mode of a mini-transaction. +@return logging mode: MTR_LOG_NONE, ... */ +UNIV_INLINE +ulint +mtr_get_log_mode( +/*=============*/ + mtr_t* mtr); /*!< in: mtr */ +/***************************************************************//** +Changes the logging mode of a mini-transaction. +@return old mode */ +UNIV_INLINE +ulint +mtr_set_log_mode( +/*=============*/ + mtr_t* mtr, /*!< in: mtr */ + ulint mode); /*!< in: logging mode: MTR_LOG_NONE, ... */ +/********************************************************//** +Reads 1 - 4 bytes from a file page buffered in the buffer pool. +@return value read */ +UNIV_INTERN +ulint +mtr_read_ulint( +/*===========*/ + const byte* ptr, /*!< in: pointer from where to read */ + ulint type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +This macro locks an rw-lock in s-mode. */ +#define mtr_s_lock(B, MTR) mtr_s_lock_func((B), __FILE__, __LINE__,\ + (MTR)) +/*********************************************************************//** +This macro locks an rw-lock in x-mode. */ +#define mtr_x_lock(B, MTR) mtr_x_lock_func((B), __FILE__, __LINE__,\ + (MTR)) +/*********************************************************************//** +NOTE! Use the macro above! +Locks a lock in s-mode. */ +UNIV_INLINE +void +mtr_s_lock_func( +/*============*/ + rw_lock_t* lock, /*!< in: rw-lock */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line number */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************************//** +NOTE! Use the macro above! +Locks a lock in x-mode. */ +UNIV_INLINE +void +mtr_x_lock_func( +/*============*/ + rw_lock_t* lock, /*!< in: rw-lock */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line number */ + mtr_t* mtr); /*!< in: mtr */ +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************//** +Releases an object in the memo stack. +@return true if released */ +UNIV_INTERN +bool +mtr_memo_release( +/*=============*/ + mtr_t* mtr, /*!< in/out: mini-transaction */ + void* object, /*!< in: object */ + ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +# ifndef UNIV_HOTBACKUP +/**********************************************************//** +Checks if memo contains the given item. +@return TRUE if contains */ +UNIV_INLINE +bool +mtr_memo_contains( +/*==============*/ + mtr_t* mtr, /*!< in: mtr */ + const void* object, /*!< in: object to search */ + ulint type) /*!< in: type of object */ + __attribute__((warn_unused_result, nonnull)); + +/**********************************************************//** +Checks if memo contains the given page. +@return TRUE if contains */ +UNIV_INTERN +ibool +mtr_memo_contains_page( +/*===================*/ + mtr_t* mtr, /*!< in: mtr */ + const byte* ptr, /*!< in: pointer to buffer frame */ + ulint type); /*!< in: type of object */ +/*********************************************************//** +Prints info of an mtr handle. */ +UNIV_INTERN +void +mtr_print( +/*======*/ + mtr_t* mtr); /*!< in: mtr */ +# else /* !UNIV_HOTBACKUP */ +# define mtr_memo_contains(mtr, object, type) TRUE +# define mtr_memo_contains_page(mtr, ptr, type) TRUE +# endif /* !UNIV_HOTBACKUP */ +#endif /* UNIV_DEBUG */ +/*######################################################################*/ + +#define MTR_BUF_MEMO_SIZE 200 /* number of slots in memo */ + +/***************************************************************//** +Returns the log object of a mini-transaction buffer. +@return log */ +UNIV_INLINE +dyn_array_t* +mtr_get_log( +/*========*/ + mtr_t* mtr); /*!< in: mini-transaction */ +/***************************************************//** +Pushes an object to an mtr memo stack. */ +UNIV_INLINE +void +mtr_memo_push( +/*==========*/ + mtr_t* mtr, /*!< in: mtr */ + void* object, /*!< in: object */ + ulint type); /*!< in: object type: MTR_MEMO_S_LOCK, ... */ + +/** Mini-transaction memo stack slot. */ +struct mtr_memo_slot_t{ + ulint type; /*!< type of the stored object (MTR_MEMO_S_LOCK, ...) */ + void* object; /*!< pointer to the object */ +}; + +/* Mini-transaction handle and buffer */ +struct mtr_t{ +#ifdef UNIV_DEBUG + ulint state; /*!< MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */ +#endif + dyn_array_t memo; /*!< memo stack for locks etc. */ + dyn_array_t log; /*!< mini-transaction log */ + unsigned inside_ibuf:1; + /*!< TRUE if inside ibuf changes */ + unsigned modifications:1; + /*!< TRUE if the mini-transaction + modified buffer pool pages */ + unsigned made_dirty:1; + /*!< TRUE if mtr has made at least + one buffer pool page dirty */ + ulint n_log_recs; + /* count of how many page initial log records + have been written to the mtr log */ + ulint n_freed_pages; + /* number of pages that have been freed in + this mini-transaction */ + ulint log_mode; /* specifies which operations should be + logged; default value MTR_LOG_ALL */ + lsn_t start_lsn;/* start lsn of the possible log entry for + this mtr */ + lsn_t end_lsn;/* end lsn of the possible log entry for + this mtr */ +#ifdef UNIV_DEBUG + ulint magic_n; +#endif /* UNIV_DEBUG */ +}; + +#ifdef UNIV_DEBUG +# define MTR_MAGIC_N 54551 +#endif /* UNIV_DEBUG */ + +#define MTR_ACTIVE 12231 +#define MTR_COMMITTING 56456 +#define MTR_COMMITTED 34676 + +#ifndef UNIV_NONINL +#include "mtr0mtr.ic" +#endif + +#endif diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic new file mode 100644 index 00000000000..a9f02430220 --- /dev/null +++ b/storage/innobase/include/mtr0mtr.ic @@ -0,0 +1,296 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0mtr.ic +Mini-transaction buffer + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_HOTBACKUP +# include "sync0sync.h" +# include "sync0rw.h" +#endif /* !UNIV_HOTBACKUP */ +#include "mach0data.h" + +/***************************************************//** +Checks if a mini-transaction is dirtying a clean page. +@return TRUE if the mtr is dirtying a clean page. */ +UNIV_INTERN +ibool +mtr_block_dirtied( +/*==============*/ + const buf_block_t* block) /*!< in: block being x-fixed */ + __attribute__((nonnull,warn_unused_result)); + +/***************************************************************//** +Starts a mini-transaction. */ +UNIV_INLINE +void +mtr_start( +/*======*/ + mtr_t* mtr) /*!< out: mini-transaction */ +{ + UNIV_MEM_INVALID(mtr, sizeof *mtr); + + dyn_array_create(&(mtr->memo)); + dyn_array_create(&(mtr->log)); + + mtr->log_mode = MTR_LOG_ALL; + mtr->inside_ibuf = FALSE; + mtr->modifications = FALSE; + mtr->made_dirty = FALSE; + mtr->n_log_recs = 0; + mtr->n_freed_pages = 0; + + ut_d(mtr->state = MTR_ACTIVE); + ut_d(mtr->magic_n = MTR_MAGIC_N); +} + +/***************************************************//** +Pushes an object to an mtr memo stack. */ +UNIV_INLINE +void +mtr_memo_push( +/*==========*/ + mtr_t* mtr, /*!< in: mtr */ + void* object, /*!< in: object */ + ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */ +{ + dyn_array_t* memo; + mtr_memo_slot_t* slot; + + ut_ad(object); + ut_ad(type >= MTR_MEMO_PAGE_S_FIX); + ut_ad(type <= MTR_MEMO_X_LOCK); + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + /* If this mtr has x-fixed a clean page then we set + the made_dirty flag. This tells us if we need to + grab log_flush_order_mutex at mtr_commit so that we + can insert the dirtied page to the flush list. */ + if (type == MTR_MEMO_PAGE_X_FIX && !mtr->made_dirty) { + mtr->made_dirty = + mtr_block_dirtied((const buf_block_t*) object); + } + + memo = &(mtr->memo); + + slot = (mtr_memo_slot_t*) dyn_array_push(memo, sizeof *slot); + + slot->object = object; + slot->type = type; +} + +/**********************************************************//** +Sets and returns a savepoint in mtr. +@return savepoint */ +UNIV_INLINE +ulint +mtr_set_savepoint( +/*==============*/ + mtr_t* mtr) /*!< in: mtr */ +{ + dyn_array_t* memo; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + memo = &(mtr->memo); + + return(dyn_array_get_data_size(memo)); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Releases the (index tree) s-latch stored in an mtr memo after a +savepoint. */ +UNIV_INLINE +void +mtr_release_s_latch_at_savepoint( +/*=============================*/ + mtr_t* mtr, /*!< in: mtr */ + ulint savepoint, /*!< in: savepoint */ + rw_lock_t* lock) /*!< in: latch to release */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + memo = &(mtr->memo); + + ut_ad(dyn_array_get_data_size(memo) > savepoint); + + slot = (mtr_memo_slot_t*) dyn_array_get_element(memo, savepoint); + + ut_ad(slot->object == lock); + ut_ad(slot->type == MTR_MEMO_S_LOCK); + + rw_lock_s_unlock(lock); + + slot->object = NULL; +} + +# ifdef UNIV_DEBUG +/**********************************************************//** +Checks if memo contains the given item. +@return TRUE if contains */ +UNIV_INLINE +bool +mtr_memo_contains( +/*==============*/ + mtr_t* mtr, /*!< in: mtr */ + const void* object, /*!< in: object to search */ + ulint type) /*!< in: type of object */ +{ + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE || mtr->state == MTR_COMMITTING); + + for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo); + block; + block = dyn_array_get_prev_block(&mtr->memo, block)) { + const mtr_memo_slot_t* start + = reinterpret_cast<mtr_memo_slot_t*>( + dyn_block_get_data(block)); + mtr_memo_slot_t* slot + = reinterpret_cast<mtr_memo_slot_t*>( + dyn_block_get_data(block) + + dyn_block_get_used(block)); + + ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t))); + + while (slot-- != start) { + if (object == slot->object && type == slot->type) { + return(true); + } + } + } + + return(false); +} +# endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************************//** +Returns the log object of a mini-transaction buffer. +@return log */ +UNIV_INLINE +dyn_array_t* +mtr_get_log( +/*========*/ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + + return(&(mtr->log)); +} + +/***************************************************************//** +Gets the logging mode of a mini-transaction. +@return logging mode: MTR_LOG_NONE, ... */ +UNIV_INLINE +ulint +mtr_get_log_mode( +/*=============*/ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr); + ut_ad(mtr->log_mode >= MTR_LOG_ALL); + ut_ad(mtr->log_mode <= MTR_LOG_SHORT_INSERTS); + + return(mtr->log_mode); +} + +/***************************************************************//** +Changes the logging mode of a mini-transaction. +@return old mode */ +UNIV_INLINE +ulint +mtr_set_log_mode( +/*=============*/ + mtr_t* mtr, /*!< in: mtr */ + ulint mode) /*!< in: logging mode: MTR_LOG_NONE, ... */ +{ + ulint old_mode; + + ut_ad(mtr); + ut_ad(mode >= MTR_LOG_ALL); + ut_ad(mode <= MTR_LOG_SHORT_INSERTS); + + old_mode = mtr->log_mode; + + if ((mode == MTR_LOG_SHORT_INSERTS) && (old_mode == MTR_LOG_NONE)) { + /* Do nothing */ + } else { + mtr->log_mode = mode; + } + + ut_ad(old_mode >= MTR_LOG_ALL); + ut_ad(old_mode <= MTR_LOG_SHORT_INSERTS); + + return(old_mode); +} + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Locks a lock in s-mode. */ +UNIV_INLINE +void +mtr_s_lock_func( +/*============*/ + rw_lock_t* lock, /*!< in: rw-lock */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line number */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr); + ut_ad(lock); + + rw_lock_s_lock_inline(lock, 0, file, line); + + mtr_memo_push(mtr, lock, MTR_MEMO_S_LOCK); +} + +/*********************************************************************//** +Locks a lock in x-mode. */ +UNIV_INLINE +void +mtr_x_lock_func( +/*============*/ + rw_lock_t* lock, /*!< in: rw-lock */ + const char* file, /*!< in: file name */ + ulint line, /*!< in: line number */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr); + ut_ad(lock); + + rw_lock_x_lock_inline(lock, 0, file, line); + + mtr_memo_push(mtr, lock, MTR_MEMO_X_LOCK); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h new file mode 100644 index 00000000000..43368c0b726 --- /dev/null +++ b/storage/innobase/include/mtr0types.h @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0types.h +Mini-transaction buffer global types + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0types_h +#define mtr0types_h + +struct mtr_t; + +#endif diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h new file mode 100644 index 00000000000..ad9b6a9ac10 --- /dev/null +++ b/storage/innobase/include/os0file.h @@ -0,0 +1,1289 @@ +/*********************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file include/os0file.h +The interface to the operating system file io + +Created 10/21/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0file_h +#define os0file_h + +#include "univ.i" + +#ifndef __WIN__ +#include <dirent.h> +#include <sys/stat.h> +#include <time.h> +#endif + +/** File node of a tablespace or the log data space */ +struct fil_node_t; + +extern ibool os_has_said_disk_full; +/** Flag: enable debug printout for asynchronous i/o */ +extern ibool os_aio_print_debug; + +/** Number of pending os_file_pread() operations */ +extern ulint os_file_n_pending_preads; +/** Number of pending os_file_pwrite() operations */ +extern ulint os_file_n_pending_pwrites; + +/** Number of pending read operations */ +extern ulint os_n_pending_reads; +/** Number of pending write operations */ +extern ulint os_n_pending_writes; + +#ifdef __WIN__ + +/** We define always WIN_ASYNC_IO, and check at run-time whether + the OS actually supports it: Win 95 does not, NT does. */ +#define WIN_ASYNC_IO + +/** Use unbuffered I/O */ +#define UNIV_NON_BUFFERED_IO + +#endif + +/** File offset in bytes */ +typedef ib_uint64_t os_offset_t; +#ifdef __WIN__ +/** File handle */ +# define os_file_t HANDLE +/** Convert a C file descriptor to a native file handle +@param fd file descriptor +@return native file handle */ +# define OS_FILE_FROM_FD(fd) (HANDLE) _get_osfhandle(fd) +#else +/** File handle */ +typedef int os_file_t; +/** Convert a C file descriptor to a native file handle +@param fd file descriptor +@return native file handle */ +# define OS_FILE_FROM_FD(fd) fd +#endif + +/** Umask for creating files */ +extern ulint os_innodb_umask; + +/** The next value should be smaller or equal to the smallest sector size used +on any disk. A log block is required to be a portion of disk which is written +so that if the start and the end of a block get written to disk, then the +whole block gets written. This should be true even in most cases of a crash: +if this fails for a log block, then it is equivalent to a media failure in the +log. */ + +#define OS_FILE_LOG_BLOCK_SIZE 512 + +/** Options for os_file_create_func @{ */ +enum os_file_create_t { + OS_FILE_OPEN = 51, /*!< to open an existing file (if + doesn't exist, error) */ + OS_FILE_CREATE, /*!< to create new file (if + exists, error) */ + OS_FILE_OVERWRITE, /*!< to create a new file, if exists + the overwrite old file */ + OS_FILE_OPEN_RAW, /*!< to open a raw device or disk + partition */ + OS_FILE_CREATE_PATH, /*!< to create the directories */ + OS_FILE_OPEN_RETRY, /*!< open with retry */ + + /** Flags that can be combined with the above values. Please ensure + that the above values stay below 128. */ + + OS_FILE_ON_ERROR_NO_EXIT = 128, /*!< do not exit on unknown errors */ + OS_FILE_ON_ERROR_SILENT = 256 /*!< don't print diagnostic messages to + the log unless it is a fatal error, + this flag is only used if + ON_ERROR_NO_EXIT is set */ +}; + +#define OS_FILE_READ_ONLY 333 +#define OS_FILE_READ_WRITE 444 +#define OS_FILE_READ_ALLOW_DELETE 555 /* for mysqlbackup */ + +/* Options for file_create */ +#define OS_FILE_AIO 61 +#define OS_FILE_NORMAL 62 +/* @} */ + +/** Types for file create @{ */ +#define OS_DATA_FILE 100 +#define OS_LOG_FILE 101 +/* @} */ + +/** Error codes from os_file_get_last_error @{ */ +#define OS_FILE_NOT_FOUND 71 +#define OS_FILE_DISK_FULL 72 +#define OS_FILE_ALREADY_EXISTS 73 +#define OS_FILE_PATH_ERROR 74 +#define OS_FILE_AIO_RESOURCES_RESERVED 75 /* wait for OS aio resources + to become available again */ +#define OS_FILE_SHARING_VIOLATION 76 +#define OS_FILE_ERROR_NOT_SPECIFIED 77 +#define OS_FILE_INSUFFICIENT_RESOURCE 78 +#define OS_FILE_AIO_INTERRUPTED 79 +#define OS_FILE_OPERATION_ABORTED 80 + +#define OS_FILE_ACCESS_VIOLATION 81 + +#define OS_FILE_ERROR_MAX 100 +/* @} */ + +/** Types for aio operations @{ */ +#define OS_FILE_READ 10 +#define OS_FILE_WRITE 11 + +#define OS_FILE_LOG 256 /* This can be ORed to type */ +/* @} */ + +#define OS_AIO_N_PENDING_IOS_PER_THREAD 32 /*!< Win NT does not allow more + than 64 */ + +/** Modes for aio operations @{ */ +#define OS_AIO_NORMAL 21 /*!< Normal asynchronous i/o not for ibuf + pages or ibuf bitmap pages */ +#define OS_AIO_IBUF 22 /*!< Asynchronous i/o for ibuf pages or ibuf + bitmap pages */ +#define OS_AIO_LOG 23 /*!< Asynchronous i/o for the log */ +#define OS_AIO_SYNC 24 /*!< Asynchronous i/o where the calling thread + will itself wait for the i/o to complete, + doing also the job of the i/o-handler thread; + can be used for any pages, ibuf or non-ibuf. + This is used to save CPU time, as we can do + with fewer thread switches. Plain synchronous + i/o is not as good, because it must serialize + the file seek and read or write, causing a + bottleneck for parallelism. */ + +#define OS_AIO_SIMULATED_WAKE_LATER 512 /*!< This can be ORed to mode + in the call of os_aio(...), + if the caller wants to post several i/o + requests in a batch, and only after that + wake the i/o-handler thread; this has + effect only in simulated aio */ +/* @} */ + +#define OS_WIN31 1 /*!< Microsoft Windows 3.x */ +#define OS_WIN95 2 /*!< Microsoft Windows 95 */ +#define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */ +#define OS_WIN2000 4 /*!< Microsoft Windows 2000 */ +#define OS_WINXP 5 /*!< Microsoft Windows XP + or Windows Server 2003 */ +#define OS_WINVISTA 6 /*!< Microsoft Windows Vista + or Windows Server 2008 */ +#define OS_WIN7 7 /*!< Microsoft Windows 7 + or Windows Server 2008 R2 */ + + +extern ulint os_n_file_reads; +extern ulint os_n_file_writes; +extern ulint os_n_fsyncs; + +#ifdef UNIV_PFS_IO +/* Keys to register InnoDB I/O with performance schema */ +extern mysql_pfs_key_t innodb_file_data_key; +extern mysql_pfs_key_t innodb_file_log_key; +extern mysql_pfs_key_t innodb_file_temp_key; + +/* Following four macros are instumentations to register +various file I/O operations with performance schema. +1) register_pfs_file_open_begin() and register_pfs_file_open_end() are +used to register file creation, opening, closing and renaming. +2) register_pfs_file_io_begin() and register_pfs_file_io_end() are +used to register actual file read, write and flush +3) register_pfs_file_close_begin() and register_pfs_file_close_end() +are used to register file deletion operations*/ +# define register_pfs_file_open_begin(state, locker, key, op, name, \ + src_file, src_line) \ +do { \ + locker = PSI_FILE_CALL(get_thread_file_name_locker)( \ + state, key, op, name, &locker); \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(start_file_open_wait)( \ + locker, src_file, src_line); \ + } \ +} while (0) + +# define register_pfs_file_open_end(locker, file) \ +do { \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(end_file_open_wait_and_bind_to_descriptor)(\ + locker, file); \ + } \ +} while (0) + +# define register_pfs_file_close_begin(state, locker, key, op, name, \ + src_file, src_line) \ +do { \ + locker = PSI_FILE_CALL(get_thread_file_name_locker)( \ + state, key, op, name, &locker); \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(start_file_close_wait)( \ + locker, src_file, src_line); \ + } \ +} while (0) + +# define register_pfs_file_close_end(locker, result) \ +do { \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(end_file_close_wait)( \ + locker, result); \ + } \ +} while (0) + +# define register_pfs_file_io_begin(state, locker, file, count, op, \ + src_file, src_line) \ +do { \ + locker = PSI_FILE_CALL(get_thread_file_descriptor_locker)( \ + state, file, op); \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(start_file_wait)( \ + locker, count, src_file, src_line); \ + } \ +} while (0) + +# define register_pfs_file_io_end(locker, count) \ +do { \ + if (UNIV_LIKELY(locker != NULL)) { \ + PSI_FILE_CALL(end_file_wait)(locker, count); \ + } \ +} while (0) +#endif /* UNIV_PFS_IO */ + +/* Following macros/functions are file I/O APIs that would be performance +schema instrumented if "UNIV_PFS_IO" is defined. They would point to +wrapper functions with performance schema instrumentation in such case. + +os_file_create +os_file_create_simple +os_file_create_simple_no_error_handling +os_file_close +os_file_rename +os_aio +os_file_read +os_file_read_no_error_handling +os_file_write + +The wrapper functions have the prefix of "innodb_". */ + +#ifdef UNIV_PFS_IO +# define os_file_create(key, name, create, purpose, type, success) \ + pfs_os_file_create_func(key, name, create, purpose, type, \ + success, __FILE__, __LINE__) + +# define os_file_create_simple(key, name, create, access, success) \ + pfs_os_file_create_simple_func(key, name, create, access, \ + success, __FILE__, __LINE__) + +# define os_file_create_simple_no_error_handling( \ + key, name, create_mode, access, success) \ + pfs_os_file_create_simple_no_error_handling_func( \ + key, name, create_mode, access, success, __FILE__, __LINE__) + +# define os_file_close(file) \ + pfs_os_file_close_func(file, __FILE__, __LINE__) + +# define os_aio(type, mode, name, file, buf, offset, \ + n, message1, message2) \ + pfs_os_aio_func(type, mode, name, file, buf, offset, \ + n, message1, message2, __FILE__, __LINE__) + +# define os_file_read(file, buf, offset, n) \ + pfs_os_file_read_func(file, buf, offset, n, __FILE__, __LINE__) + +# define os_file_read_no_error_handling(file, buf, offset, n) \ + pfs_os_file_read_no_error_handling_func(file, buf, offset, n, \ + __FILE__, __LINE__) + +# define os_file_write(name, file, buf, offset, n) \ + pfs_os_file_write_func(name, file, buf, offset, \ + n, __FILE__, __LINE__) + +# define os_file_flush(file) \ + pfs_os_file_flush_func(file, __FILE__, __LINE__) + +# define os_file_rename(key, oldpath, newpath) \ + pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__) + +# define os_file_delete(key, name) \ + pfs_os_file_delete_func(key, name, __FILE__, __LINE__) + +# define os_file_delete_if_exists(key, name) \ + pfs_os_file_delete_if_exists_func(key, name, __FILE__, __LINE__) +#else /* UNIV_PFS_IO */ + +/* If UNIV_PFS_IO is not defined, these I/O APIs point +to original un-instrumented file I/O APIs */ +# define os_file_create(key, name, create, purpose, type, success) \ + os_file_create_func(name, create, purpose, type, success) + +# define os_file_create_simple(key, name, create_mode, access, success) \ + os_file_create_simple_func(name, create_mode, access, success) + +# define os_file_create_simple_no_error_handling( \ + key, name, create_mode, access, success) \ + os_file_create_simple_no_error_handling_func( \ + name, create_mode, access, success) + +# define os_file_close(file) os_file_close_func(file) + +# define os_aio(type, mode, name, file, buf, offset, n, message1, message2) \ + os_aio_func(type, mode, name, file, buf, offset, n, \ + message1, message2) + +# define os_file_read(file, buf, offset, n) \ + os_file_read_func(file, buf, offset, n) + +# define os_file_read_no_error_handling(file, buf, offset, n) \ + os_file_read_no_error_handling_func(file, buf, offset, n) + +# define os_file_write(name, file, buf, offset, n) \ + os_file_write_func(name, file, buf, offset, n) + +# define os_file_flush(file) os_file_flush_func(file) + +# define os_file_rename(key, oldpath, newpath) \ + os_file_rename_func(oldpath, newpath) + +# define os_file_delete(key, name) os_file_delete_func(name) + +# define os_file_delete_if_exists(key, name) \ + os_file_delete_if_exists_func(name) + +#endif /* UNIV_PFS_IO */ + +/* File types for directory entry data type */ + +enum os_file_type_t { + OS_FILE_TYPE_UNKNOWN = 0, + OS_FILE_TYPE_FILE, /* regular file */ + OS_FILE_TYPE_DIR, /* directory */ + OS_FILE_TYPE_LINK, /* symbolic link */ + OS_FILE_TYPE_BLOCK /* block device */ +}; + +/* Maximum path string length in bytes when referring to tables with in the +'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers +of this size from the thread stack; that is why this should not be made much +bigger than 4000 bytes */ +#define OS_FILE_MAX_PATH 4000 + +/** Struct used in fetching information of a file in a directory */ +struct os_file_stat_t { + char name[OS_FILE_MAX_PATH]; /*!< path to a file */ + os_file_type_t type; /*!< file type */ + ib_int64_t size; /*!< file size */ + time_t ctime; /*!< creation time */ + time_t mtime; /*!< modification time */ + time_t atime; /*!< access time */ + bool rw_perm; /*!< true if can be opened + in read-write mode. Only valid + if type == OS_FILE_TYPE_FILE */ +}; + +#ifdef __WIN__ +typedef HANDLE os_file_dir_t; /*!< directory stream */ +#else +typedef DIR* os_file_dir_t; /*!< directory stream */ +#endif + +#ifdef __WIN__ +/***********************************************************************//** +Gets the operating system version. Currently works only on Windows. +@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA, +OS_WIN7. */ +UNIV_INTERN +ulint +os_get_os_version(void); +/*===================*/ +#endif /* __WIN__ */ +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Creates the seek mutexes used in positioned reads and writes. */ +UNIV_INTERN +void +os_io_init_simple(void); +/*===================*/ +/***********************************************************************//** +Creates a temporary file. This function is like tmpfile(3), but +the temporary file is created in the MySQL temporary directory. +@return temporary file handle, or NULL on error */ + +FILE* +os_file_create_tmpfile(void); +/*========================*/ +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************************//** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. +@return directory stream, NULL if error */ +UNIV_INTERN +os_file_dir_t +os_file_opendir( +/*============*/ + const char* dirname, /*!< in: directory name; it must not + contain a trailing '\' or '/' */ + ibool error_is_fatal);/*!< in: TRUE if we should treat an + error as a fatal error; if we try to + open symlinks then we do not wish a + fatal error if it happens not to be + a directory */ +/***********************************************************************//** +Closes a directory stream. +@return 0 if success, -1 if failure */ +UNIV_INTERN +int +os_file_closedir( +/*=============*/ + os_file_dir_t dir); /*!< in: directory stream */ +/***********************************************************************//** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. +@return 0 if ok, -1 if error, 1 if at the end of the directory */ +UNIV_INTERN +int +os_file_readdir_next_file( +/*======================*/ + const char* dirname,/*!< in: directory name or path */ + os_file_dir_t dir, /*!< in: directory stream */ + os_file_stat_t* info); /*!< in/out: buffer where the info is returned */ +/*****************************************************************//** +This function attempts to create a directory named pathname. The new directory +gets default permissions. On Unix, the permissions are (0770 & ~umask). If the +directory exists already, nothing is done and the call succeeds, unless the +fail_if_exists arguments is true. +@return TRUE if call succeeds, FALSE on error */ +UNIV_INTERN +ibool +os_file_create_directory( +/*=====================*/ + const char* pathname, /*!< in: directory name as + null-terminated string */ + ibool fail_if_exists);/*!< in: if TRUE, pre-existing directory + is treated as an error. */ +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple_func( +/*=======================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success);/*!< out: TRUE if succeed, FALSE if error */ +/****************************************************************//** +NOTE! Use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A simple function to open or create a file. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple_no_error_handling_func( +/*=========================================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + __attribute__((nonnull, warn_unused_result)); +/****************************************************************//** +Tries to disable OS caching on an opened file descriptor. */ +UNIV_INTERN +void +os_file_set_nocache( +/*================*/ + int fd, /*!< in: file descriptor to alter */ + const char* file_name, /*!< in: file name, used in the + diagnostic message */ + const char* operation_name);/*!< in: "open" or "create"; used in the + diagnostic message */ +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create(), not directly +this function! +Opens an existing file or creates a new. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_func( +/*================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************************//** +Deletes a file. The file has to be closed before calling this. +@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_func( +/*================*/ + const char* name); /*!< in: file path as a null-terminated + string */ + +/***********************************************************************//** +Deletes a file if it exists. The file has to be closed before calling this. +@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_if_exists_func( +/*==========================*/ + const char* name); /*!< in: file path as a null-terminated + string */ +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_rename(), not directly +this function! +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_rename_func( +/*================*/ + const char* oldpath, /*!< in: old file path as a + null-terminated string */ + const char* newpath); /*!< in: new file path */ +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_close(), not directly this +function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_func( +/*===============*/ + os_file_t file); /*!< in, own: handle to a file */ + +#ifdef UNIV_PFS_IO +/****************************************************************//** +NOTE! Please use the corresponding macro os_file_create_simple(), +not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple() which opens or creates a file. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_simple_func( +/*===========================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ + __attribute__((nonnull, warn_unused_result)); + +/****************************************************************//** +NOTE! Please use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple_no_error_handling(). Add instrumentation to +monitor file creation/open. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_simple_no_error_handling_func( +/*=============================================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode, /*!< in: file create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ + __attribute__((nonnull, warn_unused_result)); + +/****************************************************************//** +NOTE! Please use the corresponding macro os_file_create(), not directly +this function! +A performance schema wrapper function for os_file_create(). +Add instrumentation to monitor file creation/open. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: file create mode */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ + __attribute__((nonnull, warn_unused_result)); + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_close(), not directly +this function! +A performance schema instrumented wrapper function for os_file_close(). +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_os_file_close_func( +/*===================*/ + os_file_t file, /*!< in, own: handle to a file */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ +/*******************************************************************//** +NOTE! Please use the corresponding macro os_file_read(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_read() which requests a synchronous read operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_read_func( +/*==================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ + +/*******************************************************************//** +NOTE! Please use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +This is the performance schema instrumented wrapper function for +os_file_read_no_error_handling_func() which requests a synchronous +read operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_read_no_error_handling_func( +/*====================================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ + +/*******************************************************************//** +NOTE! Please use the corresponding macro os_aio(), not directly this +function! +Performance schema wrapper function of os_aio() which requests +an asynchronous i/o operation. +@return TRUE if request was queued successfully, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_aio_func( +/*============*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + ulint mode, /*!< in: OS_AIO_NORMAL etc. I/O mode */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset where to read or write */ + ulint n, /*!< in: number of bytes to read or write */ + fil_node_t* message1,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + void* message2,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ +/*******************************************************************//** +NOTE! Please use the corresponding macro os_file_write(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_write() which requests a synchronous write operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_write_func( +/*===================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from which to write */ + os_offset_t offset, /*!< in: file offset where to write */ + ulint n, /*!< in: number of bytes to write */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_flush(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_flush() which flushes the write buffers of a given file to the disk. +Flushes the write buffers of a given file to the disk. +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_os_file_flush_func( +/*===================*/ + os_file_t file, /*!< in, own: handle to a file */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_rename(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_rename() +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_os_file_rename_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* oldpath,/*!< in: old file path as a null-terminated + string */ + const char* newpath,/*!< in: new file path */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_delete(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_delete() +@return TRUE if success */ +UNIV_INLINE +bool +pfs_os_file_delete_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: old file path as a null-terminated + string */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_delete_if_exists(), not +directly this function! +This is the performance schema instrumented wrapper function for +os_file_delete_if_exists() +@return TRUE if success */ +UNIV_INLINE +bool +pfs_os_file_delete_if_exists_func( +/*==============================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: old file path as a null-terminated + string */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line);/*!< in: line where the func invoked */ +#endif /* UNIV_PFS_IO */ + +#ifdef UNIV_HOTBACKUP +/***********************************************************************//** +Closes a file handle. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_no_error_handling( +/*============================*/ + os_file_t file); /*!< in, own: handle to a file */ +#endif /* UNIV_HOTBACKUP */ +/***********************************************************************//** +Gets a file size. +@return file size, or (os_offset_t) -1 on failure */ +UNIV_INTERN +os_offset_t +os_file_get_size( +/*=============*/ + os_file_t file) /*!< in: handle to a file */ + __attribute__((warn_unused_result)); +/***********************************************************************//** +Write the specified number of zeros to a newly created file. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_size( +/*=============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + os_offset_t size) /*!< in: file size */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************************//** +Truncates a file at its current position. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof( +/*============*/ + FILE* file); /*!< in: file to be truncated */ +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_flush(), not directly this function! +Flushes the write buffers of a given file to the disk. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_flush_func( +/*===============*/ + os_file_t file); /*!< in, own: handle to a file */ +/***********************************************************************//** +Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. +@return error number, or OS error number + 100 */ +UNIV_INTERN +ulint +os_file_get_last_error( +/*===================*/ + bool report_all_errors); /*!< in: TRUE if we want an error message + printed of all errors */ +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read(), not directly this function! +Requests a synchronous read operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_func( +/*==============*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n); /*!< in: number of bytes to read */ +/*******************************************************************//** +Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. */ +UNIV_INTERN +void +os_file_read_string( +/*================*/ + FILE* file, /*!< in: file to read from */ + char* str, /*!< in: buffer where to read */ + ulint size); /*!< in: size of buffer */ +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns FALSE. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_no_error_handling_func( +/*================================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n); /*!< in: number of bytes to read */ + +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_write(), not directly this +function! +Requests a synchronous write operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_write_func( +/*===============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from which to write */ + os_offset_t offset, /*!< in: file offset where to write */ + ulint n); /*!< in: number of bytes to write */ +/*******************************************************************//** +Check the existence and type of the given file. +@return TRUE if call succeeded */ +UNIV_INTERN +ibool +os_file_status( +/*===========*/ + const char* path, /*!< in: pathname of the file */ + ibool* exists, /*!< out: TRUE if file exists */ + os_file_type_t* type); /*!< out: type of the file (if it exists) */ +/****************************************************************//** +The function os_file_dirname returns a directory component of a +null-terminated pathname string. In the usual case, dirname returns +the string up to, but not including, the final '/', and basename +is the component following the final '/'. Trailing '/' characters +are not counted as part of the pathname. + +If path does not contain a slash, dirname returns the string ".". + +Concatenating the string returned by dirname, a "/", and the basename +yields a complete pathname. + +The return value is a copy of the directory component of the pathname. +The copy is allocated from heap. It is the caller responsibility +to free it after it is no longer needed. + +The following list of examples (taken from SUSv2) shows the strings +returned by dirname and basename for different paths: + + path dirname basename + "/usr/lib" "/usr" "lib" + "/usr/" "/" "usr" + "usr" "." "usr" + "/" "/" "/" + "." "." "." + ".." "." ".." + +@return own: directory component of the pathname */ +UNIV_INTERN +char* +os_file_dirname( +/*============*/ + const char* path); /*!< in: pathname */ +/****************************************************************//** +This function returns a new path name after replacing the basename +in an old path with a new basename. The old_path is a full path +name including the extension. The tablename is in the normal +form "databasename/tablename". The new base name is found after +the forward slash. Both input strings are null terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: new full pathname */ +UNIV_INTERN +char* +os_file_make_new_pathname( +/*======================*/ + const char* old_path, /*!< in: pathname */ + const char* new_name); /*!< in: new file name */ +/****************************************************************//** +This function returns a remote path name by combining a data directory +path provided in a DATA DIRECTORY clause with the tablename which is +in the form 'database/tablename'. It strips the file basename (which +is the tablename) found after the last directory in the path provided. +The full filepath created will include the database name as a directory +under the path provided. The filename is the tablename with the '.ibd' +extension. All input and output strings are null-terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: A full pathname; data_dir_path/databasename/tablename.ibd */ +UNIV_INTERN +char* +os_file_make_remote_pathname( +/*=========================*/ + const char* data_dir_path, /*!< in: pathname */ + const char* tablename, /*!< in: tablename */ + const char* extention); /*!< in: file extention; ibd,cfg*/ +/****************************************************************//** +This function reduces a null-terminated full remote path name into +the path that is sent by MySQL for DATA DIRECTORY clause. It replaces +the 'databasename/tablename.ibd' found at the end of the path with just +'tablename'. + +Since the result is always smaller than the path sent in, no new memory +is allocated. The caller should allocate memory for the path sent in. +This function manipulates that path in place. + +If the path format is not as expected, just return. The result is used +to inform a SHOW CREATE TABLE command. */ +UNIV_INTERN +void +os_file_make_data_dir_path( +/*========================*/ + char* data_dir_path); /*!< in/out: full path/data_dir_path */ +/****************************************************************//** +Creates all missing subdirectories along the given path. +@return TRUE if call succeeded FALSE otherwise */ +UNIV_INTERN +ibool +os_file_create_subdirs_if_needed( +/*=============================*/ + const char* path); /*!< in: path name */ +/*********************************************************************** +Initializes the asynchronous io system. Creates one array each for ibuf +and log i/o. Also creates one array each for read and write where each +array is divided logically into n_read_segs and n_write_segs +respectively. The caller must create an i/o handler thread for each +segment in these arrays. This function also creates the sync array. +No i/o handler thread needs to be created for that */ +UNIV_INTERN +ibool +os_aio_init( +/*========*/ + ulint n_per_seg, /*<! in: maximum number of pending aio + operations allowed per segment */ + ulint n_read_segs, /*<! in: number of reader threads */ + ulint n_write_segs, /*<! in: number of writer threads */ + ulint n_slots_sync); /*<! in: number of slots in the sync aio + array */ +/*********************************************************************** +Frees the asynchronous io system. */ +UNIV_INTERN +void +os_aio_free(void); +/*=============*/ + +/*******************************************************************//** +NOTE! Use the corresponding macro os_aio(), not directly this function! +Requests an asynchronous i/o operation. +@return TRUE if request was queued successfully, FALSE if fail */ +UNIV_INTERN +ibool +os_aio_func( +/*========*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed + to OS_AIO_SIMULATED_WAKE_LATER: the + last flag advises this function not to wake + i/o-handler threads, but the caller will + do the waking explicitly later, in this + way the caller can post several requests in + a batch; NOTE that the batch must not be + so big that it exhausts the slots in aio + arrays! NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset where to read or write */ + ulint n, /*!< in: number of bytes to read or write */ + fil_node_t* message1,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + void* message2);/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ +/************************************************************************//** +Wakes up all async i/o threads so that they know to exit themselves in +shutdown. */ +UNIV_INTERN +void +os_aio_wake_all_threads_at_shutdown(void); +/*=====================================*/ +/************************************************************************//** +Waits until there are no pending writes in os_aio_write_array. There can +be other, synchronous, pending writes. */ +UNIV_INTERN +void +os_aio_wait_until_no_pending_writes(void); +/*=====================================*/ +/**********************************************************************//** +Wakes up simulated aio i/o-handler threads if they have something to do. */ +UNIV_INTERN +void +os_aio_simulated_wake_handler_threads(void); +/*=======================================*/ +/**********************************************************************//** +This function can be called if one wants to post a batch of reads and +prefers an i/o-handler thread to handle them all at once later. You must +call os_aio_simulated_wake_handler_threads later to ensure the threads +are not left sleeping! */ +UNIV_INTERN +void +os_aio_simulated_put_read_threads_to_sleep(void); +/*============================================*/ + +#ifdef WIN_ASYNC_IO +/**********************************************************************//** +This function is only used in Windows asynchronous i/o. +Waits for an aio operation to complete. This function is used to wait the +for completed requests. The aio array of pending requests is divided +into segments. The thread specifies which segment or slot it wants to wait +for. NOTE: this function will also take care of freeing the aio slot, +therefore no other thread is allowed to do the freeing! +@return TRUE if the aio operation succeeded */ +UNIV_INTERN +ibool +os_aio_windows_handle( +/*==================*/ + ulint segment, /*!< in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads; if + this is ULINT_UNDEFINED, then it means that + sync aio is used, and this parameter is + ignored */ + ulint pos, /*!< this parameter is used only in sync aio: + wait for the aio slot at this position */ + fil_node_t**message1, /*!< out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */ +#endif + +/**********************************************************************//** +Does simulated aio. This function should be called by an i/o-handler +thread. +@return TRUE if the aio operation succeeded */ +UNIV_INTERN +ibool +os_aio_simulated_handle( +/*====================*/ + ulint segment, /*!< in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads */ + fil_node_t**message1, /*!< out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */ +/**********************************************************************//** +Validates the consistency of the aio system. +@return TRUE if ok */ +UNIV_INTERN +ibool +os_aio_validate(void); +/*=================*/ +/**********************************************************************//** +Prints info of the aio arrays. */ +UNIV_INTERN +void +os_aio_print( +/*=========*/ + FILE* file); /*!< in: file where to print */ +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +os_aio_refresh_stats(void); +/*======================*/ + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Checks that all slots in the system have been freed, that is, there are +no pending io operations. */ +UNIV_INTERN +ibool +os_aio_all_slots_free(void); +/*=======================*/ +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +This function returns information about the specified file +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +os_file_get_status( +/*===============*/ + const char* path, /*!< in: pathname of the file */ + os_file_stat_t* stat_info, /*!< information of a file in a + directory */ + bool check_rw_perm); /*!< in: for testing whether the + file can be opened in RW mode */ + +#if !defined(UNIV_HOTBACKUP) +/*********************************************************************//** +Creates a temporary file that will be deleted on close. +This function is defined in ha_innodb.cc. +@return temporary file descriptor, or < 0 on error */ +UNIV_INTERN +int +innobase_mysql_tmpfile(void); +/*========================*/ +#endif /* !UNIV_HOTBACKUP */ + + +#if defined(LINUX_NATIVE_AIO) +/************************************************************************** +This function is only used in Linux native asynchronous i/o. +Waits for an aio operation to complete. This function is used to wait the +for completed requests. The aio array of pending requests is divided +into segments. The thread specifies which segment or slot it wants to wait +for. NOTE: this function will also take care of freeing the aio slot, +therefore no other thread is allowed to do the freeing! +@return TRUE if the IO was successful */ +UNIV_INTERN +ibool +os_aio_linux_handle( +/*================*/ + ulint global_seg, /*!< in: segment number in the aio array + to wait for; segment 0 is the ibuf + i/o thread, segment 1 is log i/o thread, + then follow the non-ibuf read threads, + and the last are the non-ibuf write + threads. */ + fil_node_t**message1, /*!< out: the messages passed with the */ + void** message2, /*!< aio request; note that in case the + aio operation failed, these output + parameters are valid and can be used to + restart the operation. */ + ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */ +#endif /* LINUX_NATIVE_AIO */ + +#ifndef UNIV_NONINL +#include "os0file.ic" +#endif + +#endif diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic new file mode 100644 index 00000000000..defd8204ba3 --- /dev/null +++ b/storage/innobase/include/os0file.ic @@ -0,0 +1,449 @@ +/***************************************************************************** + +Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0file.ic +The interface to the operating system file io + +Created 2/20/2010 Jimmy Yang +*******************************************************/ + +#include "univ.i" + +#ifdef UNIV_PFS_IO +/****************************************************************//** +NOTE! Please use the corresponding macro os_file_create_simple(), +not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple() which opens or creates a file. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_simple_func( +/*===========================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + os_file_t file; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin(&state, locker, key, + ((create_mode == OS_FILE_CREATE) + ? PSI_FILE_CREATE + : PSI_FILE_OPEN), + name, src_file, src_line); + + file = os_file_create_simple_func(name, create_mode, + access_type, success); + + /* Regsiter the returning "file" value with the system */ + register_pfs_file_open_end(locker, file); + + return(file); +} + +/****************************************************************//** +NOTE! Please use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple_no_error_handling(). Add instrumentation to +monitor file creation/open. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_simple_no_error_handling_func( +/*=============================================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode, /*!< in: file create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + os_file_t file; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin(&state, locker, key, + ((create_mode == OS_FILE_CREATE) + ? PSI_FILE_CREATE + : PSI_FILE_OPEN), + name, src_file, src_line); + + file = os_file_create_simple_no_error_handling_func( + name, create_mode, access_type, success); + + register_pfs_file_open_end(locker, file); + + return(file); +} + +/****************************************************************//** +NOTE! Please use the corresponding macro os_file_create(), not directly +this function! +A performance schema wrapper function for os_file_create(). +Add instrumentation to monitor file creation/open. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +os_file_t +pfs_os_file_create_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: file create mode */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + os_file_t file; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin(&state, locker, key, + ((create_mode == OS_FILE_CREATE) + ? PSI_FILE_CREATE + : PSI_FILE_OPEN), + name, src_file, src_line); + + file = os_file_create_func(name, create_mode, purpose, type, success); + + register_pfs_file_open_end(locker, file); + + return(file); +} + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_close(), not directly +this function! +A performance schema instrumented wrapper function for os_file_close(). +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_os_file_close_func( +/*===================*/ + os_file_t file, /*!< in, own: handle to a file */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + /* register the file close */ + register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_CLOSE, + src_file, src_line); + + result = os_file_close_func(file); + + register_pfs_file_io_end(locker, 0); + + return(result); +} + +/*******************************************************************//** +NOTE! Please use the corresponding macro os_aio(), not directly this +function! +Performance schema instrumented wrapper function of os_aio() which +requests an asynchronous i/o operation. +@return TRUE if request was queued successfully, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_aio_func( +/*============*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + ulint mode, /*!< in: OS_AIO_NORMAL etc. I/O mode */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset where to read or write */ + ulint n, /*!< in: number of bytes to read or write */ + fil_node_t* message1,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + void* message2,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + /* Register the read or write I/O depending on "type" */ + register_pfs_file_io_begin(&state, locker, file, n, + (type == OS_FILE_WRITE) + ? PSI_FILE_WRITE + : PSI_FILE_READ, + src_file, src_line); + + result = os_aio_func(type, mode, name, file, buf, offset, + n, message1, message2); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/*******************************************************************//** +NOTE! Please use the corresponding macro os_file_read(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_read() which requests a synchronous read operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_read_func( +/*==================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ, + src_file, src_line); + + result = os_file_read_func(file, buf, offset, n); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/*******************************************************************//** +NOTE! Please use the corresponding macro +os_file_read_no_error_handling(), not directly this function! +This is the performance schema instrumented wrapper function for +os_file_read_no_error_handling() which requests a synchronous +positioned read operation. This function does not do any error +handling. In case of error it returns FALSE. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_read_no_error_handling_func( +/*====================================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ, + src_file, src_line); + + result = os_file_read_no_error_handling_func(file, buf, offset, n); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/*******************************************************************//** +NOTE! Please use the corresponding macro os_file_write(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_write() which requests a synchronous write operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INLINE +ibool +pfs_os_file_write_func( +/*===================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from which to write */ + os_offset_t offset, /*!< in: file offset where to write */ + ulint n, /*!< in: number of bytes to write */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_WRITE, + src_file, src_line); + + result = os_file_write_func(name, file, buf, offset, n); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_flush(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_flush() which flushes the write buffers of a given file to the disk. +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_os_file_flush_func( +/*===================*/ + os_file_t file, /*!< in, own: handle to a file */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_SYNC, + src_file, src_line); + result = os_file_flush_func(file); + + register_pfs_file_io_end(locker, 0); + + return(result); +} + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_rename(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_rename() +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_os_file_rename_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* oldpath,/*!< in: old file path as a null-terminated + string */ + const char* newpath,/*!< in: new file path */ + const char* src_file,/*!< in: file name where func invoked */ + ulint src_line)/*!< in: line where the func invoked */ +{ + ibool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_open_begin(&state, locker, key, PSI_FILE_RENAME, newpath, + src_file, src_line); + + result = os_file_rename_func(oldpath, newpath); + + register_pfs_file_open_end(locker, 0); + + return(result); +} + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_delete(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_delete() +@return TRUE if success */ +UNIV_INLINE +bool +pfs_os_file_delete_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: file path as a null-terminated + string */ + const char* src_file, /*!< in: file name where func invoked */ + ulint src_line) /*!< in: line where the func invoked */ +{ + bool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_close_begin(&state, locker, key, PSI_FILE_DELETE, + name, src_file, src_line); + + result = os_file_delete_func(name); + + register_pfs_file_close_end(locker, 0); + + return(result); +} + +/***********************************************************************//** +NOTE! Please use the corresponding macro os_file_delete_if_exists(), not +directly this function! +This is the performance schema instrumented wrapper function for +os_file_delete_if_exists() +@return TRUE if success */ +UNIV_INLINE +bool +pfs_os_file_delete_if_exists_func( +/*==============================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema Key */ + const char* name, /*!< in: file path as a null-terminated + string */ + const char* src_file, /*!< in: file name where func invoked */ + ulint src_line) /*!< in: line where the func invoked */ +{ + bool result; + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + + register_pfs_file_close_begin(&state, locker, key, PSI_FILE_DELETE, + name, src_file, src_line); + + result = os_file_delete_if_exists_func(name); + + register_pfs_file_close_end(locker, 0); + + return(result); +} +#endif /* UNIV_PFS_IO */ diff --git a/storage/innobase/include/os0once.h b/storage/innobase/include/os0once.h new file mode 100644 index 00000000000..a8bbaf1d2d4 --- /dev/null +++ b/storage/innobase/include/os0once.h @@ -0,0 +1,125 @@ +/***************************************************************************** + +Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0once.h +A class that aids executing a given function exactly once in a multi-threaded +environment. + +Created Feb 20, 2014 Vasil Dimov +*******************************************************/ + +#ifndef os0once_h +#define os0once_h + +#include "univ.i" + +#include "os0sync.h" +#include "ut0ut.h" + +/** Execute a given function exactly once in a multi-threaded environment +or wait for the function to be executed by another thread. + +Example usage: +First the user must create a control variable of type os_once::state_t and +assign it os_once::NEVER_DONE. +Then the user must pass this variable, together with a function to be +executed to os_once::do_or_wait_for_done(). + +Multiple threads can call os_once::do_or_wait_for_done() simultaneously with +the same (os_once::state_t) control variable. The provided function will be +called exactly once and when os_once::do_or_wait_for_done() returns then this +function has completed execution, by this or another thread. In other words +os_once::do_or_wait_for_done() will either execute the provided function or +will wait for its execution to complete if it is already called by another +thread or will do nothing if the function has already completed its execution +earlier. + +This mimics pthread_once(3), but unfortunatelly pthread_once(3) does not +support passing arguments to the init_routine() function. We should use +std::call_once() when we start compiling with C++11 enabled. */ +class os_once { +public: + /** Control variables' state type */ + typedef ib_uint32_t state_t; + + /** Not yet executed. */ + static const state_t NEVER_DONE = 0; + + /** Currently being executed by this or another thread. */ + static const state_t IN_PROGRESS = 1; + + /** Finished execution. */ + static const state_t DONE = 2; + +#ifdef HAVE_ATOMIC_BUILTINS + /** Call a given function or wait its execution to complete if it is + already called by another thread. + @param[in,out] state control variable + @param[in] do_func function to call + @param[in,out] do_func_arg an argument to pass to do_func(). */ + static + void + do_or_wait_for_done( + volatile state_t* state, + void (*do_func)(void*), + void* do_func_arg) + { + /* Avoid calling os_compare_and_swap_uint32() in the most + common case. */ + if (*state == DONE) { + return; + } + + if (os_compare_and_swap_uint32(state, + NEVER_DONE, IN_PROGRESS)) { + /* We are the first. Call the function. */ + + do_func(do_func_arg); + + const bool swapped = os_compare_and_swap_uint32( + state, IN_PROGRESS, DONE); + + ut_a(swapped); + } else { + /* The state is not NEVER_DONE, so either it is + IN_PROGRESS (somebody is calling the function right + now or DONE (it has already been called and completed). + Wait for it to become DONE. */ + for (;;) { + const state_t s = *state; + + switch (s) { + case DONE: + return; + case IN_PROGRESS: + break; + case NEVER_DONE: + /* fall through */ + default: + ut_error; + } + + UT_RELAX_CPU(); + } + } + } +#endif /* HAVE_ATOMIC_BUILTINS */ +}; + +#endif /* os0once_h */ diff --git a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h new file mode 100644 index 00000000000..613e3bd6947 --- /dev/null +++ b/storage/innobase/include/os0proc.h @@ -0,0 +1,77 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0proc.h +The interface to the operating system +process control primitives + +Created 9/30/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0proc_h +#define os0proc_h + +#include "univ.i" + +#ifdef UNIV_LINUX +#include <sys/ipc.h> +#include <sys/shm.h> +#endif + +typedef void* os_process_t; +typedef unsigned long int os_process_id_t; + +extern ibool os_use_large_pages; +/* Large page size. This may be a boot-time option on some platforms */ +extern ulint os_large_page_size; + +/****************************************************************//** +Converts the current process id to a number. It is not guaranteed that the +number is unique. In Linux returns the 'process number' of the current +thread. That number is the same as one sees in 'top', for example. In Linux +the thread id is not the same as one sees in 'top'. +@return process id as a number */ +UNIV_INTERN +ulint +os_proc_get_number(void); +/*====================*/ +/****************************************************************//** +Allocates large pages memory. +@return allocated memory */ +UNIV_INTERN +void* +os_mem_alloc_large( +/*===============*/ + ulint* n); /*!< in/out: number of bytes */ +/****************************************************************//** +Frees large pages memory. */ +UNIV_INTERN +void +os_mem_free_large( +/*==============*/ + void *ptr, /*!< in: pointer returned by + os_mem_alloc_large() */ + ulint size); /*!< in: size returned by + os_mem_alloc_large() */ + +#ifndef UNIV_NONINL +#include "os0proc.ic" +#endif + +#endif diff --git a/storage/innobase/include/os0proc.ic b/storage/innobase/include/os0proc.ic new file mode 100644 index 00000000000..506f4f8ce0c --- /dev/null +++ b/storage/innobase/include/os0proc.ic @@ -0,0 +1,27 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0proc.ic +The interface to the operating system +process control primitives + +Created 9/30/1995 Heikki Tuuri +*******************************************************/ + + diff --git a/storage/innobase/include/os0sync.h b/storage/innobase/include/os0sync.h new file mode 100644 index 00000000000..57b29fff663 --- /dev/null +++ b/storage/innobase/include/os0sync.h @@ -0,0 +1,743 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0sync.h +The interface to the operating system +synchronization primitives. + +Created 9/6/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0sync_h +#define os0sync_h + +#include "univ.i" +#include "ut0lst.h" +#include "sync0types.h" + +#ifdef __WIN__ +/** Native event (slow)*/ +typedef HANDLE os_native_event_t; +/** Native mutex */ +typedef CRITICAL_SECTION fast_mutex_t; +/** Native condition variable. */ +typedef CONDITION_VARIABLE os_cond_t; +#else +/** Native mutex */ +typedef pthread_mutex_t fast_mutex_t; +/** Native condition variable */ +typedef pthread_cond_t os_cond_t; +#endif + +/** Structure that includes Performance Schema Probe pfs_psi +in the os_fast_mutex structure if UNIV_PFS_MUTEX is defined */ +struct os_fast_mutex_t { + fast_mutex_t mutex; /*!< os_fast_mutex */ +#ifdef UNIV_PFS_MUTEX + struct PSI_mutex* pfs_psi;/*!< The performance schema + instrumentation hook */ +#endif +}; + +/** Operating system event handle */ +typedef struct os_event* os_event_t; + +/** An asynchronous signal sent between threads */ +struct os_event { +#ifdef __WIN__ + HANDLE handle; /*!< kernel event object, slow, + used on older Windows */ +#endif + os_fast_mutex_t os_mutex; /*!< this mutex protects the next + fields */ + ibool is_set; /*!< this is TRUE when the event is + in the signaled state, i.e., a thread + does not stop if it tries to wait for + this event */ + ib_int64_t signal_count; /*!< this is incremented each time + the event becomes signaled */ + os_cond_t cond_var; /*!< condition variable is used in + waiting for the event */ + UT_LIST_NODE_T(os_event_t) os_event_list; + /*!< list of all created events */ +}; + +/** Denotes an infinite delay for os_event_wait_time() */ +#define OS_SYNC_INFINITE_TIME ULINT_UNDEFINED + +/** Return value of os_event_wait_time() when the time is exceeded */ +#define OS_SYNC_TIME_EXCEEDED 1 + +/** Operating system mutex handle */ +typedef struct os_mutex_t* os_ib_mutex_t; + +/** Mutex protecting counts and the event and OS 'slow' mutex lists */ +extern os_ib_mutex_t os_sync_mutex; + +/** This is incremented by 1 in os_thread_create and decremented by 1 in +os_thread_exit */ +extern ulint os_thread_count; + +extern ulint os_event_count; +extern ulint os_mutex_count; +extern ulint os_fast_mutex_count; + +/*********************************************************//** +Initializes global event and OS 'slow' mutex lists. */ +UNIV_INTERN +void +os_sync_init(void); +/*==============*/ +/*********************************************************//** +Frees created events and OS 'slow' mutexes. */ +UNIV_INTERN +void +os_sync_free(void); +/*==============*/ +/*********************************************************//** +Creates an event semaphore, i.e., a semaphore which may just have two states: +signaled and nonsignaled. The created event is manual reset: it must be reset +explicitly by calling sync_os_reset_event. +@return the event handle */ +UNIV_INTERN +os_event_t +os_event_create(void); +/*==================*/ +/**********************************************************//** +Sets an event semaphore to the signaled state: lets waiting threads +proceed. */ +UNIV_INTERN +void +os_event_set( +/*=========*/ + os_event_t event); /*!< in: event to set */ +/**********************************************************//** +Resets an event semaphore to the nonsignaled state. Waiting threads will +stop to wait for the event. +The return value should be passed to os_even_wait_low() if it is desired +that this thread should not wait in case of an intervening call to +os_event_set() between this os_event_reset() and the +os_event_wait_low() call. See comments for os_event_wait_low(). */ +UNIV_INTERN +ib_int64_t +os_event_reset( +/*===========*/ + os_event_t event); /*!< in: event to reset */ +/**********************************************************//** +Frees an event object. */ +UNIV_INTERN +void +os_event_free( +/*==========*/ + os_event_t event); /*!< in: event to free */ + +/**********************************************************//** +Waits for an event object until it is in the signaled state. + +Typically, if the event has been signalled after the os_event_reset() +we'll return immediately because event->is_set == TRUE. +There are, however, situations (e.g.: sync_array code) where we may +lose this information. For example: + +thread A calls os_event_reset() +thread B calls os_event_set() [event->is_set == TRUE] +thread C calls os_event_reset() [event->is_set == FALSE] +thread A calls os_event_wait() [infinite wait!] +thread C calls os_event_wait() [infinite wait!] + +Where such a scenario is possible, to avoid infinite wait, the +value returned by os_event_reset() should be passed in as +reset_sig_count. */ +UNIV_INTERN +void +os_event_wait_low( +/*==============*/ + os_event_t event, /*!< in: event to wait */ + ib_int64_t reset_sig_count);/*!< in: zero or the value + returned by previous call of + os_event_reset(). */ + +#define os_event_wait(event) os_event_wait_low(event, 0) +#define os_event_wait_time(event, t) os_event_wait_time_low(event, t, 0) + +/**********************************************************//** +Waits for an event object until it is in the signaled state or +a timeout is exceeded. In Unix the timeout is always infinite. +@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ +UNIV_INTERN +ulint +os_event_wait_time_low( +/*===================*/ + os_event_t event, /*!< in: event to wait */ + ulint time_in_usec, /*!< in: timeout in + microseconds, or + OS_SYNC_INFINITE_TIME */ + ib_int64_t reset_sig_count); /*!< in: zero or the value + returned by previous call of + os_event_reset(). */ +/*********************************************************//** +Creates an operating system mutex semaphore. Because these are slow, the +mutex semaphore of InnoDB itself (ib_mutex_t) should be used where possible. +@return the mutex handle */ +UNIV_INTERN +os_ib_mutex_t +os_mutex_create(void); +/*=================*/ +/**********************************************************//** +Acquires ownership of a mutex semaphore. */ +UNIV_INTERN +void +os_mutex_enter( +/*===========*/ + os_ib_mutex_t mutex); /*!< in: mutex to acquire */ +/**********************************************************//** +Releases ownership of a mutex. */ +UNIV_INTERN +void +os_mutex_exit( +/*==========*/ + os_ib_mutex_t mutex); /*!< in: mutex to release */ +/**********************************************************//** +Frees an mutex object. */ +UNIV_INTERN +void +os_mutex_free( +/*==========*/ + os_ib_mutex_t mutex); /*!< in: mutex to free */ +/**********************************************************//** +Acquires ownership of a fast mutex. Currently in Windows this is the same +as os_fast_mutex_lock! +@return 0 if success, != 0 if was reserved by another thread */ +UNIV_INLINE +ulint +os_fast_mutex_trylock( +/*==================*/ + os_fast_mutex_t* fast_mutex); /*!< in: mutex to acquire */ + +/********************************************************************** +Following os_fast_ mutex APIs would be performance schema instrumented: + +os_fast_mutex_init +os_fast_mutex_lock +os_fast_mutex_unlock +os_fast_mutex_free + +These mutex APIs will point to corresponding wrapper functions that contain +the performance schema instrumentation. + +NOTE! The following macro should be used in mutex operation, not the +corresponding function. */ + +#ifdef UNIV_PFS_MUTEX +# define os_fast_mutex_init(K, M) \ + pfs_os_fast_mutex_init(K, M) + +# define os_fast_mutex_lock(M) \ + pfs_os_fast_mutex_lock(M, __FILE__, __LINE__) + +# define os_fast_mutex_unlock(M) pfs_os_fast_mutex_unlock(M) + +# define os_fast_mutex_free(M) pfs_os_fast_mutex_free(M) + +/*********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_init(), not directly +this function! +A wrapper function for os_fast_mutex_init_func(). Initializes an operating +system fast mutex semaphore. */ +UNIV_INLINE +void +pfs_os_fast_mutex_init( +/*===================*/ + PSI_mutex_key key, /*!< in: Performance Schema + key */ + os_fast_mutex_t* fast_mutex); /*!< out: fast mutex */ +/**********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_free(), not directly +this function! +Wrapper function for pfs_os_fast_mutex_free(). Also destroys the performance +schema probes when freeing the mutex */ +UNIV_INLINE +void +pfs_os_fast_mutex_free( +/*===================*/ + os_fast_mutex_t* fast_mutex); /*!< in/out: mutex to free */ +/**********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_lock, not directly +this function! +Wrapper function of os_fast_mutex_lock. Acquires ownership of a fast mutex. */ +UNIV_INLINE +void +pfs_os_fast_mutex_lock( +/*===================*/ + os_fast_mutex_t* fast_mutex, /*!< in/out: mutex to acquire */ + const char* file_name, /*!< in: file name where + locked */ + ulint line); /*!< in: line where locked */ +/**********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_unlock, not directly +this function! +Wrapper function of os_fast_mutex_unlock. Releases ownership of a fast mutex. */ +UNIV_INLINE +void +pfs_os_fast_mutex_unlock( +/*=====================*/ + os_fast_mutex_t* fast_mutex); /*!< in/out: mutex to release */ + +#else /* UNIV_PFS_MUTEX */ + +# define os_fast_mutex_init(K, M) \ + os_fast_mutex_init_func(&((os_fast_mutex_t*)(M))->mutex) + +# define os_fast_mutex_lock(M) \ + os_fast_mutex_lock_func(&((os_fast_mutex_t*)(M))->mutex) + +# define os_fast_mutex_unlock(M) \ + os_fast_mutex_unlock_func(&((os_fast_mutex_t*)(M))->mutex) + +# define os_fast_mutex_free(M) \ + os_fast_mutex_free_func(&((os_fast_mutex_t*)(M))->mutex) +#endif /* UNIV_PFS_MUTEX */ + +/**********************************************************//** +Releases ownership of a fast mutex. */ +UNIV_INTERN +void +os_fast_mutex_unlock_func( +/*======================*/ + fast_mutex_t* fast_mutex); /*!< in: mutex to release */ +/*********************************************************//** +Initializes an operating system fast mutex semaphore. */ +UNIV_INTERN +void +os_fast_mutex_init_func( +/*====================*/ + fast_mutex_t* fast_mutex); /*!< in: fast mutex */ +/**********************************************************//** +Acquires ownership of a fast mutex. */ +UNIV_INTERN +void +os_fast_mutex_lock_func( +/*====================*/ + fast_mutex_t* fast_mutex); /*!< in: mutex to acquire */ +/**********************************************************//** +Frees an mutex object. */ +UNIV_INTERN +void +os_fast_mutex_free_func( +/*====================*/ + fast_mutex_t* fast_mutex); /*!< in: mutex to free */ + +/**********************************************************//** +Atomic compare-and-swap and increment for InnoDB. */ + +#if defined(HAVE_IB_GCC_ATOMIC_BUILTINS) + +# define HAVE_ATOMIC_BUILTINS + +# ifdef HAVE_IB_GCC_ATOMIC_BUILTINS_BYTE +# define HAVE_ATOMIC_BUILTINS_BYTE +# endif + +# ifdef HAVE_IB_GCC_ATOMIC_BUILTINS_64 +# define HAVE_ATOMIC_BUILTINS_64 +# endif + +/**********************************************************//** +Returns true if swapped, ptr is pointer to target, old_val is value to +compare to, new_val is the value to swap in. */ + +# define os_compare_and_swap(ptr, old_val, new_val) \ + __sync_bool_compare_and_swap(ptr, old_val, new_val) + +# define os_compare_and_swap_ulint(ptr, old_val, new_val) \ + os_compare_and_swap(ptr, old_val, new_val) + +# define os_compare_and_swap_lint(ptr, old_val, new_val) \ + os_compare_and_swap(ptr, old_val, new_val) + +# define os_compare_and_swap_uint32(ptr, old_val, new_val) \ + os_compare_and_swap(ptr, old_val, new_val) + +# ifdef HAVE_IB_ATOMIC_PTHREAD_T_GCC +# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \ + os_compare_and_swap(ptr, old_val, new_val) +# define INNODB_RW_LOCKS_USE_ATOMICS +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes and rw_locks use GCC atomic builtins" +# else /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */ +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes use GCC atomic builtins, rw_locks do not" +# endif /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */ + +/**********************************************************//** +Returns the resulting value, ptr is pointer to target, amount is the +amount of increment. */ + +# define os_atomic_increment(ptr, amount) \ + __sync_add_and_fetch(ptr, amount) + +# define os_atomic_increment_lint(ptr, amount) \ + os_atomic_increment(ptr, amount) + +# define os_atomic_increment_uint32(ptr, amount ) \ + os_atomic_increment(ptr, amount) + +# define os_atomic_increment_ulint(ptr, amount) \ + os_atomic_increment(ptr, amount) + +# define os_atomic_increment_uint64(ptr, amount) \ + os_atomic_increment(ptr, amount) + +/* Returns the resulting value, ptr is pointer to target, amount is the +amount to decrement. */ + +# define os_atomic_decrement(ptr, amount) \ + __sync_sub_and_fetch(ptr, amount) + +# define os_atomic_decrement_uint32(ptr, amount) \ + os_atomic_decrement(ptr, amount) + +# define os_atomic_decrement_lint(ptr, amount) \ + os_atomic_decrement(ptr, amount) + +# define os_atomic_decrement_ulint(ptr, amount) \ + os_atomic_decrement(ptr, amount) + +# define os_atomic_decrement_uint64(ptr, amount) \ + os_atomic_decrement(ptr, amount) + +/**********************************************************//** +Returns the old value of *ptr, atomically sets *ptr to new_val */ + +# define os_atomic_test_and_set_byte(ptr, new_val) \ + __sync_lock_test_and_set(ptr, (byte) new_val) + +# define os_atomic_test_and_set_ulint(ptr, new_val) \ + __sync_lock_test_and_set(ptr, new_val) + +#elif defined(HAVE_IB_SOLARIS_ATOMICS) + +# define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS_BYTE +# define HAVE_ATOMIC_BUILTINS_64 + +/* If not compiling with GCC or GCC doesn't support the atomic +intrinsics and running on Solaris >= 10 use Solaris atomics */ + +# include <atomic.h> + +/**********************************************************//** +Returns true if swapped, ptr is pointer to target, old_val is value to +compare to, new_val is the value to swap in. */ + +# define os_compare_and_swap_uint32(ptr, old_val, new_val) \ + (atomic_cas_32(ptr, old_val, new_val) == old_val) + +# define os_compare_and_swap_ulint(ptr, old_val, new_val) \ + (atomic_cas_ulong(ptr, old_val, new_val) == old_val) + +# define os_compare_and_swap_lint(ptr, old_val, new_val) \ + ((lint) atomic_cas_ulong((ulong_t*) ptr, old_val, new_val) == old_val) + +# ifdef HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS +# if SIZEOF_PTHREAD_T == 4 +# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \ + ((pthread_t) atomic_cas_32(ptr, old_val, new_val) == old_val) +# elif SIZEOF_PTHREAD_T == 8 +# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \ + ((pthread_t) atomic_cas_64(ptr, old_val, new_val) == old_val) +# else +# error "SIZEOF_PTHREAD_T != 4 or 8" +# endif /* SIZEOF_PTHREAD_T CHECK */ +# define INNODB_RW_LOCKS_USE_ATOMICS +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes and rw_locks use Solaris atomic functions" +# else /* HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS */ +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes use Solaris atomic functions, rw_locks do not" +# endif /* HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS */ + +/**********************************************************//** +Returns the resulting value, ptr is pointer to target, amount is the +amount of increment. */ + +# define os_atomic_increment_uint32(ptr, amount) \ + atomic_add_32_nv(ptr, amount) + +# define os_atomic_increment_ulint(ptr, amount) \ + atomic_add_long_nv(ptr, amount) + +# define os_atomic_increment_lint(ptr, amount) \ + os_atomic_increment_ulint((ulong_t*) ptr, amount) + +# define os_atomic_increment_uint64(ptr, amount) \ + atomic_add_64_nv(ptr, amount) + +/* Returns the resulting value, ptr is pointer to target, amount is the +amount to decrement. */ + +# define os_atomic_decrement_uint32(ptr, amount) \ + os_atomic_increment_uint32(ptr, -(amount)) + +# define os_atomic_decrement_lint(ptr, amount) \ + os_atomic_increment_ulint((ulong_t*) ptr, -(amount)) + +# define os_atomic_decrement_ulint(ptr, amount) \ + os_atomic_increment_ulint(ptr, -(amount)) + +# define os_atomic_decrement_uint64(ptr, amount) \ + os_atomic_increment_uint64(ptr, -(amount)) + +/**********************************************************//** +Returns the old value of *ptr, atomically sets *ptr to new_val */ + +# define os_atomic_test_and_set_byte(ptr, new_val) \ + atomic_swap_uchar(ptr, new_val) + +# define os_atomic_test_and_set_ulint(ptr, new_val) \ + atomic_swap_ulong(ptr, new_val) + +#elif defined(HAVE_WINDOWS_ATOMICS) + +# define HAVE_ATOMIC_BUILTINS +# define HAVE_ATOMIC_BUILTINS_BYTE + +# ifndef _WIN32 +# define HAVE_ATOMIC_BUILTINS_64 +# endif + +/**********************************************************//** +Atomic compare and exchange of signed integers (both 32 and 64 bit). +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +lint +win_cmp_and_xchg_lint( +/*==================*/ + volatile lint* ptr, /*!< in/out: source/destination */ + lint new_val, /*!< in: exchange value */ + lint old_val); /*!< in: value to compare to */ + +/**********************************************************//** +Atomic addition of signed integers. +@return Initial value of the variable pointed to by ptr */ +UNIV_INLINE +lint +win_xchg_and_add( +/*=============*/ + volatile lint* ptr, /*!< in/out: address of destination */ + lint val); /*!< in: number to be added */ + +/**********************************************************//** +Atomic compare and exchange of unsigned integers. +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +ulint +win_cmp_and_xchg_ulint( +/*===================*/ + volatile ulint* ptr, /*!< in/out: source/destination */ + ulint new_val, /*!< in: exchange value */ + ulint old_val); /*!< in: value to compare to */ + +/**********************************************************//** +Atomic compare and exchange of 32 bit unsigned integers. +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +DWORD +win_cmp_and_xchg_dword( +/*===================*/ + volatile DWORD* ptr, /*!< in/out: source/destination */ + DWORD new_val, /*!< in: exchange value */ + DWORD old_val); /*!< in: value to compare to */ + +/**********************************************************//** +Returns true if swapped, ptr is pointer to target, old_val is value to +compare to, new_val is the value to swap in. */ + +# define os_compare_and_swap_uint32(ptr, old_val, new_val) \ + (InterlockedCompareExchange(reinterpret_cast<volatile long*>(ptr), \ + new_val, old_val) == old_val) + +# define os_compare_and_swap_ulint(ptr, old_val, new_val) \ + (win_cmp_and_xchg_ulint(ptr, new_val, old_val) == old_val) + +# define os_compare_and_swap_lint(ptr, old_val, new_val) \ + (win_cmp_and_xchg_lint(ptr, new_val, old_val) == old_val) + +/* windows thread objects can always be passed to windows atomic functions */ +# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \ + (win_cmp_and_xchg_dword(ptr, new_val, old_val) == old_val) + +# define INNODB_RW_LOCKS_USE_ATOMICS +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes and rw_locks use Windows interlocked functions" + +/**********************************************************//** +Returns the resulting value, ptr is pointer to target, amount is the +amount of increment. */ + +# define os_atomic_increment_lint(ptr, amount) \ + (win_xchg_and_add(ptr, amount) + amount) + +# define os_atomic_increment_uint32(ptr, amount) \ + ((ulint) InterlockedExchangeAdd((long*) ptr, amount)) + +# define os_atomic_increment_ulint(ptr, amount) \ + ((ulint) (win_xchg_and_add((lint*) ptr, (lint) amount) + amount)) + +# define os_atomic_increment_uint64(ptr, amount) \ + ((ib_uint64_t) (InterlockedExchangeAdd64( \ + (ib_int64_t*) ptr, \ + (ib_int64_t) amount) + amount)) + +/**********************************************************//** +Returns the resulting value, ptr is pointer to target, amount is the +amount to decrement. There is no atomic substract function on Windows */ + +# define os_atomic_decrement_uint32(ptr, amount) \ + ((ulint) InterlockedExchangeAdd((long*) ptr, (-amount))) + +# define os_atomic_decrement_lint(ptr, amount) \ + (win_xchg_and_add(ptr, -(lint) amount) - amount) + +# define os_atomic_decrement_ulint(ptr, amount) \ + ((ulint) (win_xchg_and_add((lint*) ptr, -(lint) amount) - amount)) + +# define os_atomic_decrement_uint64(ptr, amount) \ + ((ib_uint64_t) (InterlockedExchangeAdd64( \ + (ib_int64_t*) ptr, \ + -(ib_int64_t) amount) - amount)) + +/**********************************************************//** +Returns the old value of *ptr, atomically sets *ptr to new_val. +InterlockedExchange() operates on LONG, and the LONG will be +clobbered */ + +# define os_atomic_test_and_set_byte(ptr, new_val) \ + ((byte) InterlockedExchange(ptr, new_val)) + +# define os_atomic_test_and_set_ulong(ptr, new_val) \ + InterlockedExchange(ptr, new_val) + +#else +# define IB_ATOMICS_STARTUP_MSG \ + "Mutexes and rw_locks use InnoDB's own implementation" +#endif +#ifdef HAVE_ATOMIC_BUILTINS +#define os_atomic_inc_ulint(m,v,d) os_atomic_increment_ulint(v, d) +#define os_atomic_dec_ulint(m,v,d) os_atomic_decrement_ulint(v, d) +#else +#define os_atomic_inc_ulint(m,v,d) os_atomic_inc_ulint_func(m, v, d) +#define os_atomic_dec_ulint(m,v,d) os_atomic_dec_ulint_func(m, v, d) +#endif /* HAVE_ATOMIC_BUILTINS */ + +/**********************************************************//** +Following macros are used to update specified counter atomically +if HAVE_ATOMIC_BUILTINS defined. Otherwise, use mutex passed in +for synchronization */ +#ifdef HAVE_ATOMIC_BUILTINS +#define os_increment_counter_by_amount(mutex, counter, amount) \ + (void) os_atomic_increment_ulint(&counter, amount) + +#define os_decrement_counter_by_amount(mutex, counter, amount) \ + (void) os_atomic_increment_ulint(&counter, (-((lint) amount))) +#else +#define os_increment_counter_by_amount(mutex, counter, amount) \ + do { \ + mutex_enter(&(mutex)); \ + (counter) += (amount); \ + mutex_exit(&(mutex)); \ + } while (0) + +#define os_decrement_counter_by_amount(mutex, counter, amount) \ + do { \ + ut_a(counter >= amount); \ + mutex_enter(&(mutex)); \ + (counter) -= (amount); \ + mutex_exit(&(mutex)); \ + } while (0) +#endif /* HAVE_ATOMIC_BUILTINS */ + +#define os_inc_counter(mutex, counter) \ + os_increment_counter_by_amount(mutex, counter, 1) + +#define os_dec_counter(mutex, counter) \ + do { \ + os_decrement_counter_by_amount(mutex, counter, 1);\ + } while (0); + +/** barrier definitions for memory ordering */ +#if defined __i386__ || defined __x86_64__ || defined _M_IX86 || defined _M_X64 || defined __WIN__ +/* Performance regression was observed at some conditions for Intel +architecture. Disable memory barrier for Intel architecture for now. */ +# define os_rmb +# define os_wmb +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "Memory barrier is not used" +#elif defined(HAVE_IB_GCC_ATOMIC_THREAD_FENCE) +# define HAVE_MEMORY_BARRIER +# define os_rmb __atomic_thread_fence(__ATOMIC_ACQUIRE) +# define os_wmb __atomic_thread_fence(__ATOMIC_RELEASE) +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "GCC builtin __atomic_thread_fence() is used for memory barrier" + +#elif defined(HAVE_IB_GCC_SYNC_SYNCHRONISE) +# define HAVE_MEMORY_BARRIER +# define os_rmb __sync_synchronize() +# define os_wmb __sync_synchronize() +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "GCC builtin __sync_synchronize() is used for memory barrier" + +#elif defined(HAVE_IB_MACHINE_BARRIER_SOLARIS) +# define HAVE_MEMORY_BARRIER +# include <mbarrier.h> +# define os_rmb __machine_r_barrier() +# define os_wmb __machine_w_barrier() +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "Solaris memory ordering functions are used for memory barrier" + +#elif defined(HAVE_WINDOWS_MM_FENCE) && defined(_WIN64) +# define HAVE_MEMORY_BARRIER +# include <mmintrin.h> +# define os_rmb _mm_lfence() +# define os_wmb _mm_sfence() +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "_mm_lfence() and _mm_sfence() are used for memory barrier" + +#else +# define os_rmb +# define os_wmb +# define IB_MEMORY_BARRIER_STARTUP_MSG \ + "Memory barrier is not used" +#endif + +#ifndef UNIV_NONINL +#include "os0sync.ic" +#endif + +#endif diff --git a/storage/innobase/include/os0sync.ic b/storage/innobase/include/os0sync.ic new file mode 100644 index 00000000000..9a7e520ece6 --- /dev/null +++ b/storage/innobase/include/os0sync.ic @@ -0,0 +1,234 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0sync.ic +The interface to the operating system synchronization primitives. + +Created 9/6/1995 Heikki Tuuri +*******************************************************/ + +#ifdef __WIN__ +#include <winbase.h> +#endif + +/**********************************************************//** +Acquires ownership of a fast mutex. +@return 0 if success, != 0 if was reserved by another thread */ +UNIV_INLINE +ulint +os_fast_mutex_trylock( +/*==================*/ + os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */ +{ + fast_mutex_t* mutex = &fast_mutex->mutex; + +#ifdef __WIN__ + return(!TryEnterCriticalSection(mutex)); +#else + /* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock + so that it returns 0 on success. In the operating system + libraries, HP-UX-10.20 follows the old Posix 1003.4a Draft 4 and + returns 1 on success (but MySQL remaps that to 0), while Linux, + FreeBSD, Solaris, AIX, Tru64 Unix, HP-UX-11.0 return 0 on success. */ + + return((ulint) pthread_mutex_trylock(mutex)); +#endif +} + +#ifdef UNIV_PFS_MUTEX +/*********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_init(), not directly +this function! +A wrapper function for os_fast_mutex_init_func(). Initializes an operating +system fast mutex semaphore. */ +UNIV_INLINE +void +pfs_os_fast_mutex_init( +/*===================*/ + PSI_mutex_key key, /*!< in: Performance Schema + key */ + os_fast_mutex_t* fast_mutex) /*!< out: fast mutex */ +{ +#ifdef HAVE_PSI_MUTEX_INTERFACE + fast_mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, &fast_mutex->mutex); +#else + fast_mutex->pfs_psi = NULL; +#endif + + os_fast_mutex_init_func(&fast_mutex->mutex); +} +/******************************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_free(), not directly +this function! +Wrapper function for pfs_os_fast_mutex_free(). Also destroys the performance +schema probes when freeing the mutex */ +UNIV_INLINE +void +pfs_os_fast_mutex_free( +/*===================*/ + os_fast_mutex_t* fast_mutex) /*!< in/out: mutex */ +{ +#ifdef HAVE_PSI_MUTEX_INTERFACE + if (fast_mutex->pfs_psi != NULL) + PSI_MUTEX_CALL(destroy_mutex)(fast_mutex->pfs_psi); +#endif + fast_mutex->pfs_psi = NULL; + + os_fast_mutex_free_func(&fast_mutex->mutex); +} +/**********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_lock, not directly +this function! +Wrapper function of os_fast_mutex_lock_func. Acquires ownership of a fast +mutex. */ +UNIV_INLINE +void +pfs_os_fast_mutex_lock( +/*===================*/ + os_fast_mutex_t* fast_mutex, /*!< in/out: mutex to acquire */ + const char* file_name, /*!< in: file name where + locked */ + ulint line) /*!< in: line where locked */ +{ +#ifdef HAVE_PSI_MUTEX_INTERFACE + if (fast_mutex->pfs_psi != NULL) + { + PSI_mutex_locker* locker; + PSI_mutex_locker_state state; + + locker = PSI_MUTEX_CALL(start_mutex_wait)( + &state, fast_mutex->pfs_psi, + PSI_MUTEX_LOCK, file_name, + static_cast<uint>(line)); + + os_fast_mutex_lock_func(&fast_mutex->mutex); + + if (locker != NULL) + PSI_MUTEX_CALL(end_mutex_wait)(locker, 0); + } + else +#endif + { + os_fast_mutex_lock_func(&fast_mutex->mutex); + } + + return; +} +/**********************************************************//** +NOTE! Please use the corresponding macro os_fast_mutex_unlock, not directly +this function! +Wrapper function of os_fast_mutex_unlock_func. Releases ownership of a +fast mutex. */ +UNIV_INLINE +void +pfs_os_fast_mutex_unlock( +/*=====================*/ + os_fast_mutex_t* fast_mutex) /*!< in/out: mutex to release */ +{ +#ifdef HAVE_PSI_MUTEX_INTERFACE + if (fast_mutex->pfs_psi != NULL) + PSI_MUTEX_CALL(unlock_mutex)(fast_mutex->pfs_psi); +#endif + + os_fast_mutex_unlock_func(&fast_mutex->mutex); +} +#endif /* UNIV_PFS_MUTEX */ + +#ifdef HAVE_WINDOWS_ATOMICS + +/* Use inline functions to make 64 and 32 bit versions of windows atomic +functions so that typecasts are evaluated at compile time. Take advantage +that lint is either __int64 or long int and windows atomic functions work +on __int64 and LONG */ + +/**********************************************************//** +Atomic compare and exchange of unsigned integers. +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +lint +win_cmp_and_xchg_lint( +/*==================*/ + volatile lint* ptr, /*!< in/out: source/destination */ + lint new_val, /*!< in: exchange value */ + lint old_val) /*!< in: value to compare to */ +{ +# ifdef _WIN64 + return(InterlockedCompareExchange64(ptr, new_val, old_val)); +# else + return(InterlockedCompareExchange(ptr, new_val, old_val)); +# endif +} + +/**********************************************************//** +Atomic addition of signed integers. +@return Initial value of the variable pointed to by ptr */ +UNIV_INLINE +lint +win_xchg_and_add( +/*=============*/ + volatile lint* ptr, /*!< in/out: address of destination */ + lint val) /*!< in: number to be added */ +{ +#ifdef _WIN64 + return(InterlockedExchangeAdd64(ptr, val)); +#else + return(InterlockedExchangeAdd(ptr, val)); +#endif +} + +/**********************************************************//** +Atomic compare and exchange of unsigned integers. +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +ulint +win_cmp_and_xchg_ulint( +/*===================*/ + volatile ulint* ptr, /*!< in/out: source/destination */ + ulint new_val, /*!< in: exchange value */ + ulint old_val) /*!< in: value to compare to */ +{ + return((ulint) win_cmp_and_xchg_lint( + (volatile lint*) ptr, + (lint) new_val, + (lint) old_val)); +} + +/**********************************************************//** +Atomic compare and exchange of 32-bit unsigned integers. +@return value found before the exchange. +If it is not equal to old_value the exchange did not happen. */ +UNIV_INLINE +DWORD +win_cmp_and_xchg_dword( +/*===================*/ + volatile DWORD* ptr, /*!< in/out: source/destination */ + DWORD new_val, /*!< in: exchange value */ + DWORD old_val) /*!< in: value to compare to */ +{ + ut_ad(sizeof(DWORD) == sizeof(LONG)); /* We assume this. */ + return(InterlockedCompareExchange( + (volatile LONG*) ptr, + (LONG) new_val, + (LONG) old_val)); +} + +#endif /* HAVE_WINDOWS_ATOMICS */ + diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h new file mode 100644 index 00000000000..37c54afae80 --- /dev/null +++ b/storage/innobase/include/os0thread.h @@ -0,0 +1,154 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0thread.h +The interface to the operating system +process and thread control primitives + +Created 9/8/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0thread_h +#define os0thread_h + +#include "univ.i" + +/* Maximum number of threads which can be created in the program; +this is also the size of the wait slot array for MySQL threads which +can wait inside InnoDB */ + +#define OS_THREAD_MAX_N srv_max_n_threads + +/* Possible fixed priorities for threads */ +#define OS_THREAD_PRIORITY_NONE 100 +#define OS_THREAD_PRIORITY_BACKGROUND 1 +#define OS_THREAD_PRIORITY_NORMAL 2 +#define OS_THREAD_PRIORITY_ABOVE_NORMAL 3 + +#ifdef __WIN__ +typedef void* os_thread_t; +typedef DWORD os_thread_id_t; /*!< In Windows the thread id + is an unsigned long int */ +extern "C" { +typedef LPTHREAD_START_ROUTINE os_thread_func_t; +} + +/** Macro for specifying a Windows thread start function. */ +#define DECLARE_THREAD(func) WINAPI func + +/** Required to get around a build error on Windows. Even though our functions +are defined/declared as WINAPI f(LPVOID a); the compiler complains that they +are defined as: os_thread_ret_t (__cdecl*)(void*). Because our functions +don't access the arguments and don't return any value, we should be safe. */ +#define os_thread_create(f,a,i) \ + os_thread_create_func(reinterpret_cast<os_thread_func_t>(f), a, i) + +#else + +typedef pthread_t os_thread_t; +typedef os_thread_t os_thread_id_t; /*!< In Unix we use the thread + handle itself as the id of + the thread */ +extern "C" { typedef void* (*os_thread_func_t)(void*); } + +/** Macro for specifying a POSIX thread start function. */ +#define DECLARE_THREAD(func) func +#define os_thread_create(f,a,i) os_thread_create_func(f, a, i) + +#endif /* __WIN__ */ + +/* Define a function pointer type to use in a typecast */ +typedef void* (*os_posix_f_t) (void*); + +#ifdef HAVE_PSI_INTERFACE +/* Define for performance schema registration key */ +typedef unsigned int mysql_pfs_key_t; +#endif + +/***************************************************************//** +Compares two thread ids for equality. +@return TRUE if equal */ +UNIV_INTERN +ibool +os_thread_eq( +/*=========*/ + os_thread_id_t a, /*!< in: OS thread or thread id */ + os_thread_id_t b); /*!< in: OS thread or thread id */ +/****************************************************************//** +Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is +unique for the thread though! +@return thread identifier as a number */ +UNIV_INTERN +ulint +os_thread_pf( +/*=========*/ + os_thread_id_t a); /*!< in: OS thread identifier */ +/****************************************************************//** +Creates a new thread of execution. The execution starts from +the function given. The start function takes a void* parameter +and returns a ulint. +NOTE: We count the number of threads in os_thread_exit(). A created +thread should always use that to exit and not use return() to exit. +@return handle to the thread */ +UNIV_INTERN +os_thread_t +os_thread_create_func( +/*==================*/ + os_thread_func_t func, /*!< in: pointer to function + from which to start */ + void* arg, /*!< in: argument to start + function */ + os_thread_id_t* thread_id); /*!< out: id of the created + thread, or NULL */ + +/*****************************************************************//** +Exits the current thread. */ +UNIV_INTERN +void +os_thread_exit( +/*===========*/ + void* exit_value) /*!< in: exit value; in Windows this void* + is cast as a DWORD */ + UNIV_COLD __attribute__((noreturn)); +/*****************************************************************//** +Returns the thread identifier of current thread. +@return current thread identifier */ +UNIV_INTERN +os_thread_id_t +os_thread_get_curr_id(void); +/*========================*/ +/*****************************************************************//** +Advises the os to give up remainder of the thread's time slice. */ +UNIV_INTERN +void +os_thread_yield(void); +/*=================*/ +/*****************************************************************//** +The thread sleeps at least the time given in microseconds. */ +UNIV_INTERN +void +os_thread_sleep( +/*============*/ + ulint tm); /*!< in: time in microseconds */ + +#ifndef UNIV_NONINL +#include "os0thread.ic" +#endif + +#endif diff --git a/storage/innobase/include/os0thread.ic b/storage/innobase/include/os0thread.ic new file mode 100644 index 00000000000..0622d22f2dc --- /dev/null +++ b/storage/innobase/include/os0thread.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0thread.ic +The interface to the operating system +process and thread control primitives + +Created 9/8/1995 Heikki Tuuri +*******************************************************/ diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h new file mode 100644 index 00000000000..b1ad49b4915 --- /dev/null +++ b/storage/innobase/include/page0cur.h @@ -0,0 +1,387 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/page0cur.h +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef page0cur_h +#define page0cur_h + +#include "univ.i" + +#include "buf0types.h" +#include "page0page.h" +#include "rem0rec.h" +#include "data0data.h" +#include "mtr0mtr.h" + + +#define PAGE_CUR_ADAPT + +/* Page cursor search modes; the values must be in this order! */ + +#define PAGE_CUR_UNSUPP 0 +#define PAGE_CUR_G 1 +#define PAGE_CUR_GE 2 +#define PAGE_CUR_L 3 +#define PAGE_CUR_LE 4 +/*#define PAGE_CUR_LE_OR_EXTENDS 5*/ /* This is a search mode used in + "column LIKE 'abc%' ORDER BY column DESC"; + we have to find strings which are <= 'abc' or + which extend it */ +#ifdef UNIV_SEARCH_DEBUG +# define PAGE_CUR_DBG 6 /* As PAGE_CUR_LE, but skips search shortcut */ +#endif /* UNIV_SEARCH_DEBUG */ + +#ifdef UNIV_DEBUG +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_t* +page_cur_get_page( +/*==============*/ + page_cur_t* cur); /*!< in: page cursor */ +/*********************************************************//** +Gets pointer to the buffer block where the cursor is positioned. +@return page */ +UNIV_INLINE +buf_block_t* +page_cur_get_block( +/*===============*/ + page_cur_t* cur); /*!< in: page cursor */ +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_zip_des_t* +page_cur_get_page_zip( +/*==================*/ + page_cur_t* cur); /*!< in: page cursor */ +/*********************************************************//** +Gets the record where the cursor is positioned. +@return record */ +UNIV_INLINE +rec_t* +page_cur_get_rec( +/*=============*/ + page_cur_t* cur); /*!< in: page cursor */ +#else /* UNIV_DEBUG */ +# define page_cur_get_page(cur) page_align((cur)->rec) +# define page_cur_get_block(cur) (cur)->block +# define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block) +# define page_cur_get_rec(cur) (cur)->rec +#endif /* UNIV_DEBUG */ +/*********************************************************//** +Sets the cursor object to point before the first user record +on the page. */ +UNIV_INLINE +void +page_cur_set_before_first( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Sets the cursor object to point after the last user record on +the page. */ +UNIV_INLINE +void +page_cur_set_after_last( +/*====================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Returns TRUE if the cursor is before first user record on page. +@return TRUE if at start */ +UNIV_INLINE +ibool +page_cur_is_before_first( +/*=====================*/ + const page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Returns TRUE if the cursor is after last user record. +@return TRUE if at end */ +UNIV_INLINE +ibool +page_cur_is_after_last( +/*===================*/ + const page_cur_t* cur); /*!< in: cursor */ +/**********************************************************//** +Positions the cursor on the given record. */ +UNIV_INLINE +void +page_cur_position( +/*==============*/ + const rec_t* rec, /*!< in: record on a page */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + page_cur_t* cur); /*!< out: page cursor */ +/**********************************************************//** +Invalidates a page cursor by setting the record pointer NULL. */ +UNIV_INLINE +void +page_cur_invalidate( +/*================*/ + page_cur_t* cur); /*!< out: page cursor */ +/**********************************************************//** +Moves the cursor to the next record on page. */ +UNIV_INLINE +void +page_cur_move_to_next( +/*==================*/ + page_cur_t* cur); /*!< in/out: cursor; must not be after last */ +/**********************************************************//** +Moves the cursor to the previous record on page. */ +UNIV_INLINE +void +page_cur_move_to_prev( +/*==================*/ + page_cur_t* cur); /*!< in/out: cursor; not before first */ +#ifndef UNIV_HOTBACKUP +/***********************************************************//** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INLINE +rec_t* +page_cur_tuple_insert( +/*==================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dtuple_t* tuple, /*!< in: pointer to a data tuple */ + dict_index_t* index, /*!< in: record descriptor */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ + __attribute__((nonnull(1,2,3,4,5), warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INLINE +rec_t* +page_cur_rec_insert( +/*================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const rec_t* rec, /*!< in: record to insert */ + dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */ +/***********************************************************//** +Inserts a record next to page cursor on an uncompressed page. +Returns pointer to inserted record if succeed, i.e., enough +space available, NULL otherwise. The cursor stays at the same position. +@return pointer to record if succeed, NULL otherwise */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_low( +/*====================*/ + rec_t* current_rec,/*!< in: pointer to current record after + which the new record is inserted */ + dict_index_t* index, /*!< in: record descriptor */ + const rec_t* rec, /*!< in: pointer to a physical record */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ + __attribute__((nonnull(1,2,3,4), warn_unused_result)); +/***********************************************************//** +Inserts a record next to page cursor on a compressed and uncompressed +page. Returns pointer to inserted record if succeed, i.e., +enough space available, NULL otherwise. +The cursor stays at the same position. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_zip( +/*====================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: record descriptor */ + const rec_t* rec, /*!< in: pointer to a physical record */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ + __attribute__((nonnull(1,2,3,4), warn_unused_result)); +/*************************************************************//** +Copies records from page to a newly created page, from a given record onward, +including that record. Infimum and supremum records are not copied. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). */ +UNIV_INTERN +void +page_copy_rec_list_end_to_created_page( +/*===================================*/ + page_t* new_page, /*!< in/out: index page to copy to */ + rec_t* rec, /*!< in: first record to copy */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr */ +/***********************************************************//** +Deletes a record at the page cursor. The cursor is moved to the +next record after the deleted one. */ +UNIV_INTERN +void +page_cur_delete_rec( +/*================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dict_index_t* index, /*!< in: record descriptor */ + const ulint* offsets,/*!< in: rec_get_offsets( + cursor->rec, index) */ + mtr_t* mtr); /*!< in: mini-transaction handle */ +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Searches the right position for a page cursor. +@return number of matched fields on the left */ +UNIV_INLINE +ulint +page_cur_search( +/*============*/ + const buf_block_t* block, /*!< in: buffer block */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + page_cur_t* cursor);/*!< out: page cursor */ +/****************************************************************//** +Searches the right position for a page cursor. */ +UNIV_INTERN +void +page_cur_search_with_match( +/*=======================*/ + const buf_block_t* block, /*!< in: buffer block */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + ulint* iup_matched_fields, + /*!< in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /*!< in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor);/*!< out: page cursor */ +/***********************************************************//** +Positions a page cursor on a randomly chosen user record on a page. If there +are no user records, sets the cursor on the infimum record. */ +UNIV_INTERN +void +page_cur_open_on_rnd_user_rec( +/*==========================*/ + buf_block_t* block, /*!< in: page */ + page_cur_t* cursor);/*!< out: page cursor */ +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Parses a log record of a record insert on a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_cur_parse_insert_rec( +/*======================*/ + ibool is_short,/*!< in: TRUE if short inserts */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/**********************************************************//** +Parses a log record of copying a record list end to a new created page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_copy_rec_list_to_created_page( +/*=====================================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/***********************************************************//** +Parses log record of a record delete on a page. +@return pointer to record end or NULL */ +UNIV_INTERN +byte* +page_cur_parse_delete_rec( +/*======================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/*******************************************************//** +Removes the record from a leaf page. This function does not log +any changes. It is used by the IMPORT tablespace functions. +@return true if success, i.e., the page did not become too empty */ +UNIV_INTERN +bool +page_delete_rec( +/*============*/ + const dict_index_t* index, /*!< in: The index that the record + belongs to */ + page_cur_t* pcur, /*!< in/out: page cursor on record + to delete */ + page_zip_des_t* page_zip,/*!< in: compressed page descriptor */ + const ulint* offsets);/*!< in: offsets for record */ + +/** Index page cursor */ + +struct page_cur_t{ + byte* rec; /*!< pointer to a record on page */ + buf_block_t* block; /*!< pointer to the block containing rec */ +}; + +#ifndef UNIV_NONINL +#include "page0cur.ic" +#endif + +#endif diff --git a/storage/innobase/include/page0cur.ic b/storage/innobase/include/page0cur.ic new file mode 100644 index 00000000000..028d33b17aa --- /dev/null +++ b/storage/innobase/include/page0cur.ic @@ -0,0 +1,317 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/page0cur.ic +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#include "page0page.h" +#include "buf0types.h" + +#ifdef UNIV_DEBUG +# include "rem0cmp.h" + +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_t* +page_cur_get_page( +/*==============*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + + return(page_align(cur->rec)); +} + +/*********************************************************//** +Gets pointer to the buffer block where the cursor is positioned. +@return page */ +UNIV_INLINE +buf_block_t* +page_cur_get_block( +/*===============*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(cur->block); +} + +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_zip_des_t* +page_cur_get_page_zip( +/*==================*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + return(buf_block_get_page_zip(page_cur_get_block(cur))); +} + +/*********************************************************//** +Gets the record where the cursor is positioned. +@return record */ +UNIV_INLINE +rec_t* +page_cur_get_rec( +/*=============*/ + page_cur_t* cur) /*!< in: page cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + + return(cur->rec); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************//** +Sets the cursor object to point before the first user record +on the page. */ +UNIV_INLINE +void +page_cur_set_before_first( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur) /*!< in: cursor */ +{ + cur->block = (buf_block_t*) block; + cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block)); +} + +/*********************************************************//** +Sets the cursor object to point after the last user record on +the page. */ +UNIV_INLINE +void +page_cur_set_after_last( +/*====================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur) /*!< in: cursor */ +{ + cur->block = (buf_block_t*) block; + cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block)); +} + +/*********************************************************//** +Returns TRUE if the cursor is before first user record on page. +@return TRUE if at start */ +UNIV_INLINE +ibool +page_cur_is_before_first( +/*=====================*/ + const page_cur_t* cur) /*!< in: cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(page_rec_is_infimum(cur->rec)); +} + +/*********************************************************//** +Returns TRUE if the cursor is after last user record. +@return TRUE if at end */ +UNIV_INLINE +ibool +page_cur_is_after_last( +/*===================*/ + const page_cur_t* cur) /*!< in: cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(page_rec_is_supremum(cur->rec)); +} + +/**********************************************************//** +Positions the cursor on the given record. */ +UNIV_INLINE +void +page_cur_position( +/*==============*/ + const rec_t* rec, /*!< in: record on a page */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + page_cur_t* cur) /*!< out: page cursor */ +{ + ut_ad(rec && block && cur); + ut_ad(page_align(rec) == block->frame); + + cur->rec = (rec_t*) rec; + cur->block = (buf_block_t*) block; +} + +/**********************************************************//** +Invalidates a page cursor by setting the record pointer NULL. */ +UNIV_INLINE +void +page_cur_invalidate( +/*================*/ + page_cur_t* cur) /*!< out: page cursor */ +{ + ut_ad(cur); + + cur->rec = NULL; + cur->block = NULL; +} + +/**********************************************************//** +Moves the cursor to the next record on page. */ +UNIV_INLINE +void +page_cur_move_to_next( +/*==================*/ + page_cur_t* cur) /*!< in/out: cursor; must not be after last */ +{ + ut_ad(!page_cur_is_after_last(cur)); + + cur->rec = page_rec_get_next(cur->rec); +} + +/**********************************************************//** +Moves the cursor to the previous record on page. */ +UNIV_INLINE +void +page_cur_move_to_prev( +/*==================*/ + page_cur_t* cur) /*!< in/out: page cursor, not before first */ +{ + ut_ad(!page_cur_is_before_first(cur)); + + cur->rec = page_rec_get_prev(cur->rec); +} + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Searches the right position for a page cursor. +@return number of matched fields on the left */ +UNIV_INLINE +ulint +page_cur_search( +/*============*/ + const buf_block_t* block, /*!< in: buffer block */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + page_cur_t* cursor) /*!< out: page cursor */ +{ + ulint low_matched_fields = 0; + ulint low_matched_bytes = 0; + ulint up_matched_fields = 0; + ulint up_matched_bytes = 0; + + ut_ad(dtuple_check_typed(tuple)); + + page_cur_search_with_match(block, index, tuple, mode, + &up_matched_fields, + &up_matched_bytes, + &low_matched_fields, + &low_matched_bytes, + cursor); + return(low_matched_fields); +} + +/***********************************************************//** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INLINE +rec_t* +page_cur_tuple_insert( +/*==================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dtuple_t* tuple, /*!< in: pointer to a data tuple */ + dict_index_t* index, /*!< in: record descriptor */ + ulint** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ +{ + ulint size + = rec_get_converted_size(index, tuple, n_ext); + rec_t* rec; + + if (!*heap) { + *heap = mem_heap_create(size + + (4 + REC_OFFS_HEADER_SIZE + + dtuple_get_n_fields(tuple)) + * sizeof **offsets); + } + + rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(*heap, size), + index, tuple, n_ext); + *offsets = rec_get_offsets( + rec, index, *offsets, ULINT_UNDEFINED, heap); + + if (buf_block_get_page_zip(cursor->block)) { + rec = page_cur_insert_rec_zip( + cursor, index, rec, *offsets, mtr); + } else { + rec = page_cur_insert_rec_low(cursor->rec, + index, rec, *offsets, mtr); + } + + ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets)); + return(rec); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INLINE +rec_t* +page_cur_rec_insert( +/*================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const rec_t* rec, /*!< in: record to insert */ + dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ +{ + if (buf_block_get_page_zip(cursor->block)) { + return(page_cur_insert_rec_zip( + cursor, index, rec, offsets, mtr)); + } else { + return(page_cur_insert_rec_low(cursor->rec, + index, rec, offsets, mtr)); + } +} diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h new file mode 100644 index 00000000000..b572f7abb49 --- /dev/null +++ b/storage/innobase/include/page0page.h @@ -0,0 +1,1122 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0page.h +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef page0page_h +#define page0page_h + +#include "univ.i" + +#include "page0types.h" +#include "fil0fil.h" +#include "buf0buf.h" +#include "data0data.h" +#include "dict0dict.h" +#include "rem0rec.h" +#include "fsp0fsp.h" +#include "mtr0mtr.h" + +#ifdef UNIV_MATERIALIZE +#undef UNIV_INLINE +#define UNIV_INLINE +#endif + +/* PAGE HEADER + =========== + +Index page header starts at the first offset left free by the FIL-module */ + +typedef byte page_header_t; + +#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this + offset */ +/*-----------------------------*/ +#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */ +#define PAGE_HEAP_TOP 2 /* pointer to record heap top */ +#define PAGE_N_HEAP 4 /* number of records in the heap, + bit 15=flag: new-style compact page format */ +#define PAGE_FREE 6 /* pointer to start of page free record list */ +#define PAGE_GARBAGE 8 /* number of bytes in deleted records */ +#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or + NULL if this info has been reset by a delete, + for example */ +#define PAGE_DIRECTION 12 /* last insert direction: PAGE_LEFT, ... */ +#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same + direction */ +#define PAGE_N_RECS 16 /* number of user records on the page */ +#define PAGE_MAX_TRX_ID 18 /* highest id of a trx which may have modified + a record on the page; trx_id_t; defined only + in secondary indexes and in the insert buffer + tree */ +#define PAGE_HEADER_PRIV_END 26 /* end of private data structure of the page + header which are set in a page create */ +/*----*/ +#define PAGE_LEVEL 26 /* level of the node in an index tree; the + leaf level is the level 0. This field should + not be written to after page creation. */ +#define PAGE_INDEX_ID 28 /* index id where the page belongs. + This field should not be written to after + page creation. */ +#define PAGE_BTR_SEG_LEAF 36 /* file segment header for the leaf pages in + a B-tree: defined only on the root page of a + B-tree, but not in the root of an ibuf tree */ +#define PAGE_BTR_IBUF_FREE_LIST PAGE_BTR_SEG_LEAF +#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF + /* in the place of PAGE_BTR_SEG_LEAF and _TOP + there is a free list base node if the page is + the root page of an ibuf tree, and at the same + place is the free list node if the page is in + a free list */ +#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE) + /* file segment header for the non-leaf pages + in a B-tree: defined only on the root page of + a B-tree, but not in the root of an ibuf + tree */ +/*----*/ +#define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE) + /* start of data on the page */ + +#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES) + /* offset of the page infimum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8) + /* offset of the page supremum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9) + /* offset of the page supremum record end on + an old-style page */ +#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES) + /* offset of the page infimum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8) + /* offset of the page supremum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8) + /* offset of the page supremum record end on + a new-style compact page */ +/*-----------------------------*/ + +/* Heap numbers */ +#define PAGE_HEAP_NO_INFIMUM 0 /* page infimum */ +#define PAGE_HEAP_NO_SUPREMUM 1 /* page supremum */ +#define PAGE_HEAP_NO_USER_LOW 2 /* first user record in + creation (insertion) order, + not necessarily collation order; + this record may have been deleted */ + +/* Directions of cursor movement */ +#define PAGE_LEFT 1 +#define PAGE_RIGHT 2 +#define PAGE_SAME_REC 3 +#define PAGE_SAME_PAGE 4 +#define PAGE_NO_DIRECTION 5 + +/* PAGE DIRECTORY + ============== +*/ + +typedef byte page_dir_slot_t; +typedef page_dir_slot_t page_dir_t; + +/* Offset of the directory start down from the page end. We call the +slot with the highest file address directory start, as it points to +the first record in the list of records. */ +#define PAGE_DIR FIL_PAGE_DATA_END + +/* We define a slot in the page directory as two bytes */ +#define PAGE_DIR_SLOT_SIZE 2 + +/* The offset of the physically lower end of the directory, counted from +page end, when the page is empty */ +#define PAGE_EMPTY_DIR_START (PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE) + +/* The maximum and minimum number of records owned by a directory slot. The +number may drop below the minimum in the first and the last slot in the +directory. */ +#define PAGE_DIR_SLOT_MAX_N_OWNED 8 +#define PAGE_DIR_SLOT_MIN_N_OWNED 4 + +/************************************************************//** +Gets the start of a page. +@return start of the page */ +UNIV_INLINE +page_t* +page_align( +/*=======*/ + const void* ptr) /*!< in: pointer to page frame */ + __attribute__((const)); +/************************************************************//** +Gets the offset within a page. +@return offset from the start of the page */ +UNIV_INLINE +ulint +page_offset( +/*========*/ + const void* ptr) /*!< in: pointer to page frame */ + __attribute__((const)); +/*************************************************************//** +Returns the max trx id field value. */ +UNIV_INLINE +trx_id_t +page_get_max_trx_id( +/*================*/ + const page_t* page); /*!< in: page */ +/*************************************************************//** +Sets the max trx id field value. */ +UNIV_INTERN +void +page_set_max_trx_id( +/*================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr); /*!< in/out: mini-transaction, or NULL */ +/*************************************************************//** +Sets the max trx id field value if trx_id is bigger than the previous +value. */ +UNIV_INLINE +void +page_update_max_trx_id( +/*===================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr); /*!< in/out: mini-transaction */ +/*************************************************************//** +Reads the given header field. */ +UNIV_INLINE +ulint +page_header_get_field( +/*==================*/ + const page_t* page, /*!< in: page */ + ulint field); /*!< in: PAGE_N_DIR_SLOTS, ... */ +/*************************************************************//** +Sets the given header field. */ +UNIV_INLINE +void +page_header_set_field( +/*==================*/ + page_t* page, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /*!< in: PAGE_N_DIR_SLOTS, ... */ + ulint val); /*!< in: value */ +/*************************************************************//** +Returns the offset stored in the given header field. +@return offset from the start of the page, or 0 */ +UNIV_INLINE +ulint +page_header_get_offs( +/*=================*/ + const page_t* page, /*!< in: page */ + ulint field) /*!< in: PAGE_FREE, ... */ + __attribute__((nonnull, pure)); + +/*************************************************************//** +Returns the pointer stored in the given header field, or NULL. */ +#define page_header_get_ptr(page, field) \ + (page_header_get_offs(page, field) \ + ? page + page_header_get_offs(page, field) : NULL) +/*************************************************************//** +Sets the pointer stored in the given header field. */ +UNIV_INLINE +void +page_header_set_ptr( +/*================*/ + page_t* page, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /*!< in/out: PAGE_FREE, ... */ + const byte* ptr); /*!< in: pointer or NULL*/ +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Resets the last insert info field in the page header. Writes to mlog +about this operation. */ +UNIV_INLINE +void +page_header_reset_last_insert( +/*==========================*/ + page_t* page, /*!< in: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + mtr_t* mtr); /*!< in: mtr */ +#endif /* !UNIV_HOTBACKUP */ +/************************************************************//** +Gets the offset of the first record on the page. +@return offset of the first record in record list, relative from page */ +UNIV_INLINE +ulint +page_get_infimum_offset( +/*====================*/ + const page_t* page); /*!< in: page which must have record(s) */ +/************************************************************//** +Gets the offset of the last record on the page. +@return offset of the last record in record list, relative from page */ +UNIV_INLINE +ulint +page_get_supremum_offset( +/*=====================*/ + const page_t* page); /*!< in: page which must have record(s) */ +#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page)) +#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page)) + +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INTERN +const rec_t* +page_rec_get_nth_const( +/*===================*/ + const page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ + __attribute__((nonnull, warn_unused_result)); +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INLINE +rec_t* +page_rec_get_nth( +/*=============*/ + page_t* page, /*< in: page */ + ulint nth) /*!< in: nth record */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. +@return middle record */ +UNIV_INLINE +rec_t* +page_get_middle_rec( +/*================*/ + page_t* page) /*!< in: page */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************//** +Compares a data tuple to a physical record. Differs from the function +cmp_dtuple_rec_with_match in the way that the record must reside on an +index page, and also page infimum and supremum records can be given in +the parameter rec. These are considered as the negative infinity and +the positive infinity in the alphabetical order. +@return 1, 0, -1, if dtuple is greater, equal, less than rec, +respectively, when only the common first fields are compared */ +UNIV_INLINE +int +page_cmp_dtuple_rec_with_match( +/*===========================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record on a page; may also + be page infimum or supremum, in which case + matched-parameter values below are not + affected */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint* matched_fields, /*!< in/out: number of already completely + matched fields; when function returns + contains the value for current comparison */ + ulint* matched_bytes); /*!< in/out: number of already matched + bytes within the first field not completely + matched; when function returns contains the + value for current comparison */ +#endif /* !UNIV_HOTBACKUP */ +/*************************************************************//** +Gets the page number. +@return page number */ +UNIV_INLINE +ulint +page_get_page_no( +/*=============*/ + const page_t* page); /*!< in: page */ +/*************************************************************//** +Gets the tablespace identifier. +@return space id */ +UNIV_INLINE +ulint +page_get_space_id( +/*==============*/ + const page_t* page); /*!< in: page */ +/*************************************************************//** +Gets the number of user records on page (the infimum and supremum records +are not user records). +@return number of user records */ +UNIV_INLINE +ulint +page_get_n_recs( +/*============*/ + const page_t* page); /*!< in: index page */ +/***************************************************************//** +Returns the number of records before the given record in chain. +The number includes infimum and supremum records. +This is the inverse function of page_rec_get_nth(). +@return number of records */ +UNIV_INTERN +ulint +page_rec_get_n_recs_before( +/*=======================*/ + const rec_t* rec); /*!< in: the physical record */ +/*************************************************************//** +Gets the number of records in the heap. +@return number of user records */ +UNIV_INLINE +ulint +page_dir_get_n_heap( +/*================*/ + const page_t* page); /*!< in: index page */ +/*************************************************************//** +Sets the number of records in the heap. */ +UNIV_INLINE +void +page_dir_set_n_heap( +/*================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL. + Note that the size of the dense page directory + in the compressed page trailer is + n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */ + ulint n_heap);/*!< in: number of records */ +/*************************************************************//** +Gets the number of dir slots in directory. +@return number of slots */ +UNIV_INLINE +ulint +page_dir_get_n_slots( +/*=================*/ + const page_t* page); /*!< in: index page */ +/*************************************************************//** +Sets the number of dir slots in directory. */ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + page_t* page, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint n_slots);/*!< in: number of slots */ +#ifdef UNIV_DEBUG +/*************************************************************//** +Gets pointer to nth directory slot. +@return pointer to dir slot */ +UNIV_INLINE +page_dir_slot_t* +page_dir_get_nth_slot( +/*==================*/ + const page_t* page, /*!< in: index page */ + ulint n); /*!< in: position */ +#else /* UNIV_DEBUG */ +# define page_dir_get_nth_slot(page, n) \ + ((page) + UNIV_PAGE_SIZE - PAGE_DIR \ + - (n + 1) * PAGE_DIR_SLOT_SIZE) +#endif /* UNIV_DEBUG */ +/**************************************************************//** +Used to check the consistency of a record on a page. +@return TRUE if succeed */ +UNIV_INLINE +ibool +page_rec_check( +/*===========*/ + const rec_t* rec); /*!< in: record */ +/***************************************************************//** +Gets the record pointed to by a directory slot. +@return pointer to record */ +UNIV_INLINE +const rec_t* +page_dir_slot_get_rec( +/*==================*/ + const page_dir_slot_t* slot); /*!< in: directory slot */ +/***************************************************************//** +This is used to set the record offset in a directory slot. */ +UNIV_INLINE +void +page_dir_slot_set_rec( +/*==================*/ + page_dir_slot_t* slot, /*!< in: directory slot */ + rec_t* rec); /*!< in: record on the page */ +/***************************************************************//** +Gets the number of records owned by a directory slot. +@return number of records */ +UNIV_INLINE +ulint +page_dir_slot_get_n_owned( +/*======================*/ + const page_dir_slot_t* slot); /*!< in: page directory slot */ +/***************************************************************//** +This is used to set the owned records field of a directory slot. */ +UNIV_INLINE +void +page_dir_slot_set_n_owned( +/*======================*/ + page_dir_slot_t*slot, /*!< in/out: directory slot */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint n); /*!< in: number of records owned by the slot */ +/************************************************************//** +Calculates the space reserved for directory slots of a given +number of records. The exact value is a fraction number +n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is +rounded upwards to an integer. */ +UNIV_INLINE +ulint +page_dir_calc_reserved_space( +/*=========================*/ + ulint n_recs); /*!< in: number of records */ +/***************************************************************//** +Looks for the directory slot which owns the given record. +@return the directory slot number */ +UNIV_INTERN +ulint +page_dir_find_owner_slot( +/*=====================*/ + const rec_t* rec); /*!< in: the physical record */ +/************************************************************//** +Determine whether the page is in new-style compact format. +@return nonzero if the page is in compact format, zero if it is in +old-style format */ +UNIV_INLINE +ulint +page_is_comp( +/*=========*/ + const page_t* page); /*!< in: index page */ +/************************************************************//** +TRUE if the record is on a page in compact format. +@return nonzero if in compact format */ +UNIV_INLINE +ulint +page_rec_is_comp( +/*=============*/ + const rec_t* rec); /*!< in: record */ +/***************************************************************//** +Returns the heap number of a record. +@return heap number */ +UNIV_INLINE +ulint +page_rec_get_heap_no( +/*=================*/ + const rec_t* rec); /*!< in: the physical record */ +/************************************************************//** +Determine whether the page is a B-tree leaf. +@return true if the page is a B-tree leaf (PAGE_LEVEL = 0) */ +UNIV_INLINE +bool +page_is_leaf( +/*=========*/ + const page_t* page) /*!< in: page */ + __attribute__((nonnull, pure)); +/************************************************************//** +Determine whether the page is empty. +@return true if the page is empty (PAGE_N_RECS = 0) */ +UNIV_INLINE +bool +page_is_empty( +/*==========*/ + const page_t* page) /*!< in: page */ + __attribute__((nonnull, pure)); +/************************************************************//** +Determine whether the page contains garbage. +@return true if the page contains garbage (PAGE_GARBAGE is not 0) */ +UNIV_INLINE +bool +page_has_garbage( +/*=============*/ + const page_t* page) /*!< in: page */ + __attribute__((nonnull, pure)); +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_low( +/*==================*/ + const rec_t* rec, /*!< in: pointer to record */ + ulint comp); /*!< in: nonzero=compact page layout */ +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +rec_t* +page_rec_get_next( +/*==============*/ + rec_t* rec); /*!< in: pointer to record */ +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_const( +/*====================*/ + const rec_t* rec); /*!< in: pointer to record */ +/************************************************************//** +Gets the pointer to the next non delete-marked record on the page. +If all subsequent records are delete-marked, then this function +will return the supremum record. +@return pointer to next non delete-marked record or pointer to supremum */ +UNIV_INLINE +const rec_t* +page_rec_get_next_non_del_marked( +/*=============================*/ + const rec_t* rec); /*!< in: pointer to record */ +/************************************************************//** +Sets the pointer to the next record on the page. */ +UNIV_INLINE +void +page_rec_set_next( +/*==============*/ + rec_t* rec, /*!< in: pointer to record, + must not be page supremum */ + const rec_t* next); /*!< in: pointer to next record, + must not be page infimum */ +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record */ +UNIV_INLINE +const rec_t* +page_rec_get_prev_const( +/*====================*/ + const rec_t* rec); /*!< in: pointer to record, must not be page + infimum */ +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record */ +UNIV_INLINE +rec_t* +page_rec_get_prev( +/*==============*/ + rec_t* rec); /*!< in: pointer to record, + must not be page infimum */ +/************************************************************//** +TRUE if the record is a user record on the page. +@return TRUE if a user record */ +UNIV_INLINE +ibool +page_rec_is_user_rec_low( +/*=====================*/ + ulint offset) /*!< in: record offset on page */ + __attribute__((const)); +/************************************************************//** +TRUE if the record is the supremum record on a page. +@return TRUE if the supremum record */ +UNIV_INLINE +ibool +page_rec_is_supremum_low( +/*=====================*/ + ulint offset) /*!< in: record offset on page */ + __attribute__((const)); +/************************************************************//** +TRUE if the record is the infimum record on a page. +@return TRUE if the infimum record */ +UNIV_INLINE +ibool +page_rec_is_infimum_low( +/*====================*/ + ulint offset) /*!< in: record offset on page */ + __attribute__((const)); + +/************************************************************//** +TRUE if the record is a user record on the page. +@return TRUE if a user record */ +UNIV_INLINE +ibool +page_rec_is_user_rec( +/*=================*/ + const rec_t* rec) /*!< in: record */ + __attribute__((const)); +/************************************************************//** +TRUE if the record is the supremum record on a page. +@return TRUE if the supremum record */ +UNIV_INLINE +ibool +page_rec_is_supremum( +/*=================*/ + const rec_t* rec) /*!< in: record */ + __attribute__((const)); + +/************************************************************//** +TRUE if the record is the infimum record on a page. +@return TRUE if the infimum record */ +UNIV_INLINE +ibool +page_rec_is_infimum( +/*================*/ + const rec_t* rec) /*!< in: record */ + __attribute__((const)); +/***************************************************************//** +Looks for the record which owns the given record. +@return the owner record */ +UNIV_INLINE +rec_t* +page_rec_find_owner_rec( +/*====================*/ + rec_t* rec); /*!< in: the physical record */ +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Write a 32-bit field in a data dictionary record. */ +UNIV_INLINE +void +page_rec_write_field( +/*=================*/ + rec_t* rec, /*!< in/out: record to update */ + ulint i, /*!< in: index of the field to update */ + ulint val, /*!< in: value to write */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of record heap. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size( +/*=====================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs);/*!< in: number of records */ +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of record heap if page is first reorganized. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size_after_reorganize( +/*======================================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs);/*!< in: number of records */ +/*************************************************************//** +Calculates free space if a page is emptied. +@return free space */ +UNIV_INLINE +ulint +page_get_free_space_of_empty( +/*=========================*/ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((const)); +/**********************************************************//** +Returns the base extra size of a physical record. This is the +size of the fixed header, independent of the record size. +@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */ +UNIV_INLINE +ulint +page_rec_get_base_extra_size( +/*=========================*/ + const rec_t* rec); /*!< in: physical record */ +/************************************************************//** +Returns the sum of the sizes of the records in the record list +excluding the infimum and supremum records. +@return data in bytes */ +UNIV_INLINE +ulint +page_get_data_size( +/*===============*/ + const page_t* page); /*!< in: index page */ +/************************************************************//** +Allocates a block of memory from the head of the free list +of an index page. */ +UNIV_INLINE +void +page_mem_alloc_free( +/*================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page with enough + space available for inserting the record, + or NULL */ + rec_t* next_rec,/*!< in: pointer to the new head of the + free record list */ + ulint need); /*!< in: number of bytes allocated */ +/************************************************************//** +Allocates a block of memory from the heap of an index page. +@return pointer to start of allocated buffer, or NULL if allocation fails */ +UNIV_INTERN +byte* +page_mem_alloc_heap( +/*================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page with enough + space available for inserting the record, + or NULL */ + ulint need, /*!< in: total number of bytes needed */ + ulint* heap_no);/*!< out: this contains the heap number + of the allocated record + if allocation succeeds */ +/************************************************************//** +Puts a record to free list. */ +UNIV_INLINE +void +page_mem_free( +/*==========*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, + or NULL */ + rec_t* rec, /*!< in: pointer to the (origin of) + record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets);/*!< in: array returned by + rec_get_offsets() */ +/**********************************************************//** +Create an uncompressed B-tree index page. +@return pointer to the page */ +UNIV_INTERN +page_t* +page_create( +/*========*/ + buf_block_t* block, /*!< in: a buffer block where the + page is created */ + mtr_t* mtr, /*!< in: mini-transaction handle */ + ulint comp); /*!< in: nonzero=compact page format */ +/**********************************************************//** +Create a compressed B-tree index page. +@return pointer to the page */ +UNIV_INTERN +page_t* +page_create_zip( +/*============*/ + buf_block_t* block, /*!< in/out: a buffer frame where the + page is created */ + dict_index_t* index, /*!< in: the index of the page */ + ulint level, /*!< in: the B-tree level of the page */ + trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); +/**********************************************************//** +Empty a previously created B-tree index page. */ +UNIV_INTERN +void +page_create_empty( +/*==============*/ + buf_block_t* block, /*!< in/out: B-tree block */ + dict_index_t* index, /*!< in: the index of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull(1,2))); +/*************************************************************//** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). */ +UNIV_INTERN +void +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr */ +/*************************************************************//** +Copies records from page to new_page, from the given record onward, +including that record. Infimum and supremum records are not copied. +The records are copied to the start of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original successor of the infimum record on +new_page, or NULL on zip overflow (new_block will be decompressed) */ +UNIV_INTERN +rec_t* +page_copy_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/*************************************************************//** +Copies records from page to new_page, up to the given record, NOT +including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original predecessor of the supremum record on +new_page, or NULL on zip overflow (new_block will be decompressed) */ +UNIV_INTERN +rec_t* +page_copy_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/*************************************************************//** +Deletes records from a page from a given record onward, including that record. +The infimum and supremum records are not deleted. */ +UNIV_INTERN +void +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /*!< in: pointer to record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + ulint n_recs, /*!< in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /*!< in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/*************************************************************//** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. */ +UNIV_INTERN +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /*!< in: record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); +/*************************************************************//** +Moves record list end to another page. Moved records include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return TRUE on success; FALSE on compression failure (new_block will +be decompressed) */ +UNIV_INTERN +ibool +page_move_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in: index page from where to move */ + rec_t* split_rec, /*!< in: first record to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull(1, 2, 4, 5))); +/*************************************************************//** +Moves record list start to another page. Moved records do not include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return TRUE on success; FALSE on compression failure */ +UNIV_INTERN +ibool +page_move_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in/out: page containing split_rec */ + rec_t* split_rec, /*!< in: first record not to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull(1, 2, 4, 5))); +/****************************************************************//** +Splits a directory slot which owns too many records. */ +UNIV_INTERN +void +page_dir_split_slot( +/*================*/ + page_t* page, /*!< in: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be written, or NULL */ + ulint slot_no)/*!< in: the directory slot */ + __attribute__((nonnull(1))); +/*************************************************************//** +Tries to balance the given directory slot with too few records +with the upper neighbor, so that there are at least the minimum number +of records owned by the slot; this may result in the merging of +two slots. */ +UNIV_INTERN +void +page_dir_balance_slot( +/*==================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint slot_no)/*!< in: the directory slot */ + __attribute__((nonnull(1))); +/**********************************************************//** +Parses a log record of a record list end or start deletion. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_delete_rec_list( +/*=======================*/ + byte type, /*!< in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in/out: buffer block or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/***********************************************************//** +Parses a redo log record of creating a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_create( +/*==============*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + ulint comp, /*!< in: nonzero=compact page format */ + buf_block_t* block, /*!< in: block or NULL */ + mtr_t* mtr); /*!< in: mtr or NULL */ +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Prints record contents including the data relevant only in +the index page context. */ +UNIV_INTERN +void +page_rec_print( +/*===========*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets);/*!< in: record descriptor */ +# ifdef UNIV_BTR_PRINT +/***************************************************************//** +This is used to print the contents of the directory for +debugging purposes. */ +UNIV_INTERN +void +page_dir_print( +/*===========*/ + page_t* page, /*!< in: index page */ + ulint pr_n); /*!< in: print n first and n last entries */ +/***************************************************************//** +This is used to print the contents of the page record list for +debugging purposes. */ +UNIV_INTERN +void +page_print_list( +/*============*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint pr_n); /*!< in: print n first and n last entries */ +/***************************************************************//** +Prints the info in a page header. */ +UNIV_INTERN +void +page_header_print( +/*==============*/ + const page_t* page); /*!< in: index page */ +/***************************************************************//** +This is used to print the contents of the page for +debugging purposes. */ +UNIV_INTERN +void +page_print( +/*=======*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint dn, /*!< in: print dn first and last entries + in directory */ + ulint rn); /*!< in: print rn first and last records + in directory */ +# endif /* UNIV_BTR_PRINT */ +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************//** +The following is used to validate a record on a page. This function +differs from rec_validate as it can also check the n_owned field and +the heap_no field. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_rec_validate( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +/***************************************************************//** +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. */ +UNIV_INTERN +void +page_check_dir( +/*===========*/ + const page_t* page); /*!< in: index page */ +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_simple_validate_old( +/*=====================*/ + const page_t* page); /*!< in: index page in ROW_FORMAT=REDUNDANT */ +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_simple_validate_new( +/*=====================*/ + const page_t* page); /*!< in: index page in ROW_FORMAT!=REDUNDANT */ +/***************************************************************//** +This function checks the consistency of an index page. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_validate( +/*==========*/ + const page_t* page, /*!< in: index page */ + dict_index_t* index); /*!< in: data dictionary index containing + the page record type definition */ +/***************************************************************//** +Looks in the page record list for a record with the given heap number. +@return record, NULL if not found */ + +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + const page_t* page, /*!< in: index page */ + ulint heap_no);/*!< in: heap number */ +/** Get the last non-delete-marked record on a page. +@param[in] page index tree leaf page +@return the last record, not delete-marked +@retval infimum record if all records are delete-marked */ + +const rec_t* +page_find_rec_max_not_deleted( + const page_t* page); +#ifdef UNIV_MATERIALIZE +#undef UNIV_INLINE +#define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif + +#ifndef UNIV_NONINL +#include "page0page.ic" +#endif + +#endif diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic new file mode 100644 index 00000000000..9b81156708f --- /dev/null +++ b/storage/innobase/include/page0page.ic @@ -0,0 +1,1176 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0page.ic +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#ifdef UNIV_DEBUG +# include "log0recv.h" +#endif /* !UNIV_DEBUG */ +#ifndef UNIV_HOTBACKUP +# include "rem0cmp.h" +#endif /* !UNIV_HOTBACKUP */ +#include "mtr0log.h" +#include "page0zip.h" + +#ifdef UNIV_MATERIALIZE +#undef UNIV_INLINE +#define UNIV_INLINE +#endif + +/************************************************************//** +Gets the start of a page. +@return start of the page */ +UNIV_INLINE +page_t* +page_align( +/*=======*/ + const void* ptr) /*!< in: pointer to page frame */ +{ + return((page_t*) ut_align_down(ptr, UNIV_PAGE_SIZE)); +} +/************************************************************//** +Gets the offset within a page. +@return offset from the start of the page */ +UNIV_INLINE +ulint +page_offset( +/*========*/ + const void* ptr) /*!< in: pointer to page frame */ +{ + return(ut_align_offset(ptr, UNIV_PAGE_SIZE)); +} +/*************************************************************//** +Returns the max trx id field value. */ +UNIV_INLINE +trx_id_t +page_get_max_trx_id( +/*================*/ + const page_t* page) /*!< in: page */ +{ + ut_ad(page); + + return(mach_read_from_8(page + PAGE_HEADER + PAGE_MAX_TRX_ID)); +} + +/*************************************************************//** +Sets the max trx id field value if trx_id is bigger than the previous +value. */ +UNIV_INLINE +void +page_update_max_trx_id( +/*===================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(block); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* During crash recovery, this function may be called on + something else than a leaf page of a secondary index or the + insert buffer index tree (dict_index_is_sec_or_ibuf() returns + TRUE for the dummy indexes constructed during redo log + application). In that case, PAGE_MAX_TRX_ID is unused, + and trx_id is usually zero. */ + ut_ad(trx_id || recv_recovery_is_on()); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + + if (page_get_max_trx_id(buf_block_get_frame(block)) < trx_id) { + + page_set_max_trx_id(block, page_zip, trx_id, mtr); + } +} + +/*************************************************************//** +Reads the given header field. */ +UNIV_INLINE +ulint +page_header_get_field( +/*==================*/ + const page_t* page, /*!< in: page */ + ulint field) /*!< in: PAGE_LEVEL, ... */ +{ + ut_ad(page); + ut_ad(field <= PAGE_INDEX_ID); + + return(mach_read_from_2(page + PAGE_HEADER + field)); +} + +/*************************************************************//** +Sets the given header field. */ +UNIV_INLINE +void +page_header_set_field( +/*==================*/ + page_t* page, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /*!< in: PAGE_N_DIR_SLOTS, ... */ + ulint val) /*!< in: value */ +{ + ut_ad(page); + ut_ad(field <= PAGE_N_RECS); + ut_ad(field == PAGE_N_HEAP || val < UNIV_PAGE_SIZE); + ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE); + + mach_write_to_2(page + PAGE_HEADER + field, val); + if (page_zip) { + page_zip_write_header(page_zip, + page + PAGE_HEADER + field, 2, NULL); + } +} + +/*************************************************************//** +Returns the offset stored in the given header field. +@return offset from the start of the page, or 0 */ +UNIV_INLINE +ulint +page_header_get_offs( +/*=================*/ + const page_t* page, /*!< in: page */ + ulint field) /*!< in: PAGE_FREE, ... */ +{ + ulint offs; + + ut_ad(page); + ut_ad((field == PAGE_FREE) + || (field == PAGE_LAST_INSERT) + || (field == PAGE_HEAP_TOP)); + + offs = page_header_get_field(page, field); + + ut_ad((field != PAGE_HEAP_TOP) || offs); + + return(offs); +} + +/*************************************************************//** +Sets the pointer stored in the given header field. */ +UNIV_INLINE +void +page_header_set_ptr( +/*================*/ + page_t* page, /*!< in: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /*!< in: PAGE_FREE, ... */ + const byte* ptr) /*!< in: pointer or NULL*/ +{ + ulint offs; + + ut_ad(page); + ut_ad((field == PAGE_FREE) + || (field == PAGE_LAST_INSERT) + || (field == PAGE_HEAP_TOP)); + + if (ptr == NULL) { + offs = 0; + } else { + offs = ptr - page; + } + + ut_ad((field != PAGE_HEAP_TOP) || offs); + + page_header_set_field(page, page_zip, field, offs); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Resets the last insert info field in the page header. Writes to mlog +about this operation. */ +UNIV_INLINE +void +page_header_reset_last_insert( +/*==========================*/ + page_t* page, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(page && mtr); + + if (page_zip) { + mach_write_to_2(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_LAST_INSERT), + 2, mtr); + } else { + mlog_write_ulint(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0, + MLOG_2BYTES, mtr); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************//** +Determine whether the page is in new-style compact format. +@return nonzero if the page is in compact format, zero if it is in +old-style format */ +UNIV_INLINE +ulint +page_is_comp( +/*=========*/ + const page_t* page) /*!< in: index page */ +{ + return(page_header_get_field(page, PAGE_N_HEAP) & 0x8000); +} + +/************************************************************//** +TRUE if the record is on a page in compact format. +@return nonzero if in compact format */ +UNIV_INLINE +ulint +page_rec_is_comp( +/*=============*/ + const rec_t* rec) /*!< in: record */ +{ + return(page_is_comp(page_align(rec))); +} + +/***************************************************************//** +Returns the heap number of a record. +@return heap number */ +UNIV_INLINE +ulint +page_rec_get_heap_no( +/*=================*/ + const rec_t* rec) /*!< in: the physical record */ +{ + if (page_rec_is_comp(rec)) { + return(rec_get_heap_no_new(rec)); + } else { + return(rec_get_heap_no_old(rec)); + } +} + +/************************************************************//** +Determine whether the page is a B-tree leaf. +@return true if the page is a B-tree leaf (PAGE_LEVEL = 0) */ +UNIV_INLINE +bool +page_is_leaf( +/*=========*/ + const page_t* page) /*!< in: page */ +{ + return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_LEVEL))); +} + +/************************************************************//** +Determine whether the page is empty. +@return true if the page is empty (PAGE_N_RECS = 0) */ +UNIV_INLINE +bool +page_is_empty( +/*==========*/ + const page_t* page) /*!< in: page */ +{ + return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_N_RECS))); +} + +/************************************************************//** +Determine whether the page contains garbage. +@return true if the page contains garbage (PAGE_GARBAGE is not 0) */ +UNIV_INLINE +bool +page_has_garbage( +/*=============*/ + const page_t* page) /*!< in: page */ +{ + return(!!*(const uint16*) (page + (PAGE_HEADER + PAGE_GARBAGE))); +} + +/************************************************************//** +Gets the offset of the first record on the page. +@return offset of the first record in record list, relative from page */ +UNIV_INLINE +ulint +page_get_infimum_offset( +/*====================*/ + const page_t* page) /*!< in: page which must have record(s) */ +{ + ut_ad(page); + ut_ad(!page_offset(page)); + + if (page_is_comp(page)) { + return(PAGE_NEW_INFIMUM); + } else { + return(PAGE_OLD_INFIMUM); + } +} + +/************************************************************//** +Gets the offset of the last record on the page. +@return offset of the last record in record list, relative from page */ +UNIV_INLINE +ulint +page_get_supremum_offset( +/*=====================*/ + const page_t* page) /*!< in: page which must have record(s) */ +{ + ut_ad(page); + ut_ad(!page_offset(page)); + + if (page_is_comp(page)) { + return(PAGE_NEW_SUPREMUM); + } else { + return(PAGE_OLD_SUPREMUM); + } +} + +/************************************************************//** +TRUE if the record is a user record on the page. +@return TRUE if a user record */ +UNIV_INLINE +ibool +page_rec_is_user_rec_low( +/*=====================*/ + ulint offset) /*!< in: record offset on page */ +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); +#if PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM +# error "PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM" +#endif +#if PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM +# error "PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM" +#endif +#if PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM +# error "PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM" +#endif +#if PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM +# error "PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM" +#endif +#if PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END +# error "PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END" +#endif +#if PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END +# error "PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END" +#endif + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); + + return(offset != PAGE_NEW_SUPREMUM + && offset != PAGE_NEW_INFIMUM + && offset != PAGE_OLD_INFIMUM + && offset != PAGE_OLD_SUPREMUM); +} + +/************************************************************//** +TRUE if the record is the supremum record on a page. +@return TRUE if the supremum record */ +UNIV_INLINE +ibool +page_rec_is_supremum_low( +/*=====================*/ + ulint offset) /*!< in: record offset on page */ +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); + + return(offset == PAGE_NEW_SUPREMUM + || offset == PAGE_OLD_SUPREMUM); +} + +/************************************************************//** +TRUE if the record is the infimum record on a page. +@return TRUE if the infimum record */ +UNIV_INLINE +ibool +page_rec_is_infimum_low( +/*====================*/ + ulint offset) /*!< in: record offset on page */ +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); + + return(offset == PAGE_NEW_INFIMUM || offset == PAGE_OLD_INFIMUM); +} + +/************************************************************//** +TRUE if the record is a user record on the page. +@return TRUE if a user record */ +UNIV_INLINE +ibool +page_rec_is_user_rec( +/*=================*/ + const rec_t* rec) /*!< in: record */ +{ + ut_ad(page_rec_check(rec)); + + return(page_rec_is_user_rec_low(page_offset(rec))); +} + +/************************************************************//** +TRUE if the record is the supremum record on a page. +@return TRUE if the supremum record */ +UNIV_INLINE +ibool +page_rec_is_supremum( +/*=================*/ + const rec_t* rec) /*!< in: record */ +{ + ut_ad(page_rec_check(rec)); + + return(page_rec_is_supremum_low(page_offset(rec))); +} + +/************************************************************//** +TRUE if the record is the infimum record on a page. +@return TRUE if the infimum record */ +UNIV_INLINE +ibool +page_rec_is_infimum( +/*================*/ + const rec_t* rec) /*!< in: record */ +{ + ut_ad(page_rec_check(rec)); + + return(page_rec_is_infimum_low(page_offset(rec))); +} + +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INLINE +rec_t* +page_rec_get_nth( +/*=============*/ + page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ +{ + return((rec_t*) page_rec_get_nth_const(page, nth)); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. +@return middle record */ +UNIV_INLINE +rec_t* +page_get_middle_rec( +/*================*/ + page_t* page) /*!< in: page */ +{ + ulint middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2; + + return(page_rec_get_nth(page, middle)); +} + +/*************************************************************//** +Compares a data tuple to a physical record. Differs from the function +cmp_dtuple_rec_with_match in the way that the record must reside on an +index page, and also page infimum and supremum records can be given in +the parameter rec. These are considered as the negative infinity and +the positive infinity in the alphabetical order. +@return 1, 0, -1, if dtuple is greater, equal, less than rec, +respectively, when only the common first fields are compared */ +UNIV_INLINE +int +page_cmp_dtuple_rec_with_match( +/*===========================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record on a page; may also + be page infimum or supremum, in which case + matched-parameter values below are not + affected */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint* matched_fields, /*!< in/out: number of already completely + matched fields; when function returns + contains the value for current comparison */ + ulint* matched_bytes) /*!< in/out: number of already matched + bytes within the first field not completely + matched; when function returns contains the + value for current comparison */ +{ + ulint rec_offset; + + ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec)); + + rec_offset = page_offset(rec); + + if (rec_offset == PAGE_NEW_INFIMUM + || rec_offset == PAGE_OLD_INFIMUM) { + + return(1); + + } else if (rec_offset == PAGE_NEW_SUPREMUM + || rec_offset == PAGE_OLD_SUPREMUM) { + + return(-1); + } + + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, + matched_fields, + matched_bytes)); +} +#endif /* !UNIV_HOTBACKUP */ + +/*************************************************************//** +Gets the page number. +@return page number */ +UNIV_INLINE +ulint +page_get_page_no( +/*=============*/ + const page_t* page) /*!< in: page */ +{ + ut_ad(page == page_align((page_t*) page)); + return(mach_read_from_4(page + FIL_PAGE_OFFSET)); +} + +/*************************************************************//** +Gets the tablespace identifier. +@return space id */ +UNIV_INLINE +ulint +page_get_space_id( +/*==============*/ + const page_t* page) /*!< in: page */ +{ + ut_ad(page == page_align((page_t*) page)); + return(mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); +} + +/*************************************************************//** +Gets the number of user records on page (infimum and supremum records +are not user records). +@return number of user records */ +UNIV_INLINE +ulint +page_get_n_recs( +/*============*/ + const page_t* page) /*!< in: index page */ +{ + return(page_header_get_field(page, PAGE_N_RECS)); +} + +/*************************************************************//** +Gets the number of dir slots in directory. +@return number of slots */ +UNIV_INLINE +ulint +page_dir_get_n_slots( +/*=================*/ + const page_t* page) /*!< in: index page */ +{ + return(page_header_get_field(page, PAGE_N_DIR_SLOTS)); +} +/*************************************************************//** +Sets the number of dir slots in directory. */ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + page_t* page, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint n_slots)/*!< in: number of slots */ +{ + page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots); +} + +/*************************************************************//** +Gets the number of records in the heap. +@return number of user records */ +UNIV_INLINE +ulint +page_dir_get_n_heap( +/*================*/ + const page_t* page) /*!< in: index page */ +{ + return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff); +} + +/*************************************************************//** +Sets the number of records in the heap. */ +UNIV_INLINE +void +page_dir_set_n_heap( +/*================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL. + Note that the size of the dense page directory + in the compressed page trailer is + n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */ + ulint n_heap) /*!< in: number of records */ +{ + ut_ad(n_heap < 0x8000); + ut_ad(!page_zip || n_heap + == (page_header_get_field(page, PAGE_N_HEAP) & 0x7fff) + 1); + + page_header_set_field(page, page_zip, PAGE_N_HEAP, n_heap + | (0x8000 + & page_header_get_field(page, PAGE_N_HEAP))); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Gets pointer to nth directory slot. +@return pointer to dir slot */ +UNIV_INLINE +page_dir_slot_t* +page_dir_get_nth_slot( +/*==================*/ + const page_t* page, /*!< in: index page */ + ulint n) /*!< in: position */ +{ + ut_ad(page_dir_get_n_slots(page) > n); + + return((page_dir_slot_t*) + page + UNIV_PAGE_SIZE - PAGE_DIR + - (n + 1) * PAGE_DIR_SLOT_SIZE); +} +#endif /* UNIV_DEBUG */ + +/**************************************************************//** +Used to check the consistency of a record on a page. +@return TRUE if succeed */ +UNIV_INLINE +ibool +page_rec_check( +/*===========*/ + const rec_t* rec) /*!< in: record */ +{ + const page_t* page = page_align(rec); + + ut_a(rec); + + ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP)); + ut_a(page_offset(rec) >= PAGE_DATA); + + return(TRUE); +} + +/***************************************************************//** +Gets the record pointed to by a directory slot. +@return pointer to record */ +UNIV_INLINE +const rec_t* +page_dir_slot_get_rec( +/*==================*/ + const page_dir_slot_t* slot) /*!< in: directory slot */ +{ + return(page_align(slot) + mach_read_from_2(slot)); +} + +/***************************************************************//** +This is used to set the record offset in a directory slot. */ +UNIV_INLINE +void +page_dir_slot_set_rec( +/*==================*/ + page_dir_slot_t* slot, /*!< in: directory slot */ + rec_t* rec) /*!< in: record on the page */ +{ + ut_ad(page_rec_check(rec)); + + mach_write_to_2(slot, page_offset(rec)); +} + +/***************************************************************//** +Gets the number of records owned by a directory slot. +@return number of records */ +UNIV_INLINE +ulint +page_dir_slot_get_n_owned( +/*======================*/ + const page_dir_slot_t* slot) /*!< in: page directory slot */ +{ + const rec_t* rec = page_dir_slot_get_rec(slot); + if (page_rec_is_comp(slot)) { + return(rec_get_n_owned_new(rec)); + } else { + return(rec_get_n_owned_old(rec)); + } +} + +/***************************************************************//** +This is used to set the owned records field of a directory slot. */ +UNIV_INLINE +void +page_dir_slot_set_n_owned( +/*======================*/ + page_dir_slot_t*slot, /*!< in/out: directory slot */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint n) /*!< in: number of records owned by the slot */ +{ + rec_t* rec = (rec_t*) page_dir_slot_get_rec(slot); + if (page_rec_is_comp(slot)) { + rec_set_n_owned_new(rec, page_zip, n); + } else { + ut_ad(!page_zip); + rec_set_n_owned_old(rec, n); + } +} + +/************************************************************//** +Calculates the space reserved for directory slots of a given number of +records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE / +PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer. */ +UNIV_INLINE +ulint +page_dir_calc_reserved_space( +/*=========================*/ + ulint n_recs) /*!< in: number of records */ +{ + return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1) + / PAGE_DIR_SLOT_MIN_N_OWNED); +} + +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_low( +/*==================*/ + const rec_t* rec, /*!< in: pointer to record */ + ulint comp) /*!< in: nonzero=compact page layout */ +{ + ulint offs; + const page_t* page; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + + offs = rec_get_next_offs(rec, comp); + + if (offs >= UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Next record offset is nonsensical %lu" + " in record at offset %lu\n" + "InnoDB: rec address %p, space id %lu, page %lu\n", + (ulong) offs, (ulong) page_offset(rec), + (void*) rec, + (ulong) page_get_space_id(page), + (ulong) page_get_page_no(page)); + buf_page_print(page, 0, 0); + + ut_error; + } else if (offs == 0) { + + return(NULL); + } + + return(page + offs); +} + +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +rec_t* +page_rec_get_next( +/*==============*/ + rec_t* rec) /*!< in: pointer to record */ +{ + return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec))); +} + +/************************************************************//** +Gets the pointer to the next record on the page. +@return pointer to next record */ +UNIV_INLINE +const rec_t* +page_rec_get_next_const( +/*====================*/ + const rec_t* rec) /*!< in: pointer to record */ +{ + return(page_rec_get_next_low(rec, page_rec_is_comp(rec))); +} + +/************************************************************//** +Gets the pointer to the next non delete-marked record on the page. +If all subsequent records are delete-marked, then this function +will return the supremum record. +@return pointer to next non delete-marked record or pointer to supremum */ +UNIV_INLINE +const rec_t* +page_rec_get_next_non_del_marked( +/*=============================*/ + const rec_t* rec) /*!< in: pointer to record */ +{ + const rec_t* r; + ulint page_is_compact = page_rec_is_comp(rec); + + for (r = page_rec_get_next_const(rec); + !page_rec_is_supremum(r) + && rec_get_deleted_flag(r, page_is_compact); + r = page_rec_get_next_const(r)) { + /* noop */ + } + + return(r); +} + +/************************************************************//** +Sets the pointer to the next record on the page. */ +UNIV_INLINE +void +page_rec_set_next( +/*==============*/ + rec_t* rec, /*!< in: pointer to record, + must not be page supremum */ + const rec_t* next) /*!< in: pointer to next record, + must not be page infimum */ +{ + ulint offs; + + ut_ad(page_rec_check(rec)); + ut_ad(!page_rec_is_supremum(rec)); + ut_ad(rec != next); + + ut_ad(!next || !page_rec_is_infimum(next)); + ut_ad(!next || page_align(rec) == page_align(next)); + + offs = next != NULL ? page_offset(next) : 0; + + if (page_rec_is_comp(rec)) { + rec_set_next_offs_new(rec, offs); + } else { + rec_set_next_offs_old(rec, offs); + } +} + +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record */ +UNIV_INLINE +const rec_t* +page_rec_get_prev_const( +/*====================*/ + const rec_t* rec) /*!< in: pointer to record, must not be page + infimum */ +{ + const page_dir_slot_t* slot; + ulint slot_no; + const rec_t* rec2; + const rec_t* prev_rec = NULL; + const page_t* page; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + + ut_ad(!page_rec_is_infimum(rec)); + + slot_no = page_dir_find_owner_slot(rec); + + ut_a(slot_no != 0); + + slot = page_dir_get_nth_slot(page, slot_no - 1); + + rec2 = page_dir_slot_get_rec(slot); + + if (page_is_comp(page)) { + while (rec != rec2) { + prev_rec = rec2; + rec2 = page_rec_get_next_low(rec2, TRUE); + } + } else { + while (rec != rec2) { + prev_rec = rec2; + rec2 = page_rec_get_next_low(rec2, FALSE); + } + } + + ut_a(prev_rec); + + return(prev_rec); +} + +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record */ +UNIV_INLINE +rec_t* +page_rec_get_prev( +/*==============*/ + rec_t* rec) /*!< in: pointer to record, must not be page + infimum */ +{ + return((rec_t*) page_rec_get_prev_const(rec)); +} + +/***************************************************************//** +Looks for the record which owns the given record. +@return the owner record */ +UNIV_INLINE +rec_t* +page_rec_find_owner_rec( +/*====================*/ + rec_t* rec) /*!< in: the physical record */ +{ + ut_ad(page_rec_check(rec)); + + if (page_rec_is_comp(rec)) { + while (rec_get_n_owned_new(rec) == 0) { + rec = page_rec_get_next(rec); + } + } else { + while (rec_get_n_owned_old(rec) == 0) { + rec = page_rec_get_next(rec); + } + } + + return(rec); +} + +/**********************************************************//** +Returns the base extra size of a physical record. This is the +size of the fixed header, independent of the record size. +@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */ +UNIV_INLINE +ulint +page_rec_get_base_extra_size( +/*=========================*/ + const rec_t* rec) /*!< in: physical record */ +{ +#if REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES +# error "REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES" +#endif + return(REC_N_NEW_EXTRA_BYTES + (ulint) !page_rec_is_comp(rec)); +} + +/************************************************************//** +Returns the sum of the sizes of the records in the record list, excluding +the infimum and supremum records. +@return data in bytes */ +UNIV_INLINE +ulint +page_get_data_size( +/*===============*/ + const page_t* page) /*!< in: index page */ +{ + ulint ret; + + ret = (ulint)(page_header_get_field(page, PAGE_HEAP_TOP) + - (page_is_comp(page) + ? PAGE_NEW_SUPREMUM_END + : PAGE_OLD_SUPREMUM_END) + - page_header_get_field(page, PAGE_GARBAGE)); + + ut_ad(ret < UNIV_PAGE_SIZE); + + return(ret); +} + + +/************************************************************//** +Allocates a block of memory from the free list of an index page. */ +UNIV_INLINE +void +page_mem_alloc_free( +/*================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page with enough + space available for inserting the record, + or NULL */ + rec_t* next_rec,/*!< in: pointer to the new head of the + free record list */ + ulint need) /*!< in: number of bytes allocated */ +{ + ulint garbage; + +#ifdef UNIV_DEBUG + const rec_t* old_rec = page_header_get_ptr(page, PAGE_FREE); + ulint next_offs; + + ut_ad(old_rec); + next_offs = rec_get_next_offs(old_rec, page_is_comp(page)); + ut_ad(next_rec == (next_offs ? page + next_offs : NULL)); +#endif + + page_header_set_ptr(page, page_zip, PAGE_FREE, next_rec); + + garbage = page_header_get_field(page, PAGE_GARBAGE); + ut_ad(garbage >= need); + + page_header_set_field(page, page_zip, PAGE_GARBAGE, garbage - need); +} + +/*************************************************************//** +Calculates free space if a page is emptied. +@return free space */ +UNIV_INLINE +ulint +page_get_free_space_of_empty( +/*=========================*/ + ulint comp) /*!< in: nonzero=compact page layout */ +{ + if (comp) { + return((ulint)(UNIV_PAGE_SIZE + - PAGE_NEW_SUPREMUM_END + - PAGE_DIR + - 2 * PAGE_DIR_SLOT_SIZE)); + } + + return((ulint)(UNIV_PAGE_SIZE + - PAGE_OLD_SUPREMUM_END + - PAGE_DIR + - 2 * PAGE_DIR_SLOT_SIZE)); +} + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Write a 32-bit field in a data dictionary record. */ +UNIV_INLINE +void +page_rec_write_field( +/*=================*/ + rec_t* rec, /*!< in/out: record to update */ + ulint i, /*!< in: index of the field to update */ + ulint val, /*!< in: value to write */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + byte* data; + ulint len; + + data = rec_get_nth_field_old(rec, i, &len); + + ut_ad(len == 4); + + mlog_write_ulint(data, val, MLOG_4BYTES, mtr); +} +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************//** +Each user record on a page, and also the deleted user records in the heap +takes its size plus the fraction of the dir cell size / +PAGE_DIR_SLOT_MIN_N_OWNED bytes for it. If the sum of these exceeds the +value of page_get_free_space_of_empty, the insert is impossible, otherwise +it is allowed. This function returns the maximum combined size of records +which can be inserted on top of the record heap. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size( +/*=====================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs) /*!< in: number of records */ +{ + ulint occupied; + ulint free_space; + + if (page_is_comp(page)) { + occupied = page_header_get_field(page, PAGE_HEAP_TOP) + - PAGE_NEW_SUPREMUM_END + + page_dir_calc_reserved_space( + n_recs + page_dir_get_n_heap(page) - 2); + + free_space = page_get_free_space_of_empty(TRUE); + } else { + occupied = page_header_get_field(page, PAGE_HEAP_TOP) + - PAGE_OLD_SUPREMUM_END + + page_dir_calc_reserved_space( + n_recs + page_dir_get_n_heap(page) - 2); + + free_space = page_get_free_space_of_empty(FALSE); + } + + /* Above the 'n_recs +' part reserves directory space for the new + inserted records; the '- 2' excludes page infimum and supremum + records */ + + if (occupied > free_space) { + + return(0); + } + + return(free_space - occupied); +} + +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of the record heap if a page is first reorganized. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size_after_reorganize( +/*======================================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs) /*!< in: number of records */ +{ + ulint occupied; + ulint free_space; + + occupied = page_get_data_size(page) + + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page)); + + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + if (occupied > free_space) { + + return(0); + } + + return(free_space - occupied); +} + +/************************************************************//** +Puts a record to free list. */ +UNIV_INLINE +void +page_mem_free( +/*==========*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip, /*!< in/out: compressed page, + or NULL */ + rec_t* rec, /*!< in: pointer to the + (origin of) record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets) /*!< in: array returned by + rec_get_offsets() */ +{ + rec_t* free; + ulint garbage; + + ut_ad(rec_offs_validate(rec, index, offsets)); + free = page_header_get_ptr(page, PAGE_FREE); + + page_rec_set_next(rec, free); + page_header_set_ptr(page, page_zip, PAGE_FREE, rec); + + garbage = page_header_get_field(page, PAGE_GARBAGE); + + page_header_set_field(page, page_zip, PAGE_GARBAGE, + garbage + rec_offs_size(offsets)); + + if (page_zip) { + page_zip_dir_delete(page_zip, rec, index, offsets, free); + } else { + page_header_set_field(page, page_zip, PAGE_N_RECS, + page_get_n_recs(page) - 1); + } +} + +#ifdef UNIV_MATERIALIZE +#undef UNIV_INLINE +#define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h new file mode 100644 index 00000000000..95143a4bb44 --- /dev/null +++ b/storage/innobase/include/page0types.h @@ -0,0 +1,169 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0types.h +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef page0types_h +#define page0types_h + +using namespace std; + +#include <map> + +#include "univ.i" +#include "dict0types.h" +#include "mtr0types.h" + +/** Eliminates a name collision on HP-UX */ +#define page_t ib_page_t +/** Type of the index page */ +typedef byte page_t; +/** Index page cursor */ +struct page_cur_t; + +/** Compressed index page */ +typedef byte page_zip_t; + +/* The following definitions would better belong to page0zip.h, +but we cannot include page0zip.h from rem0rec.ic, because +page0*.h includes rem0rec.h and may include rem0rec.ic. */ + +/** Number of bits needed for representing different compressed page sizes */ +#define PAGE_ZIP_SSIZE_BITS 3 + +/** Maximum compressed page shift size */ +#define PAGE_ZIP_SSIZE_MAX \ + (UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1) + +/* Make sure there are enough bits available to store the maximum zip +ssize, which is the number of shifts from 512. */ +#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS) +# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)" +#endif + +/** Compressed page descriptor */ +struct page_zip_des_t +{ + page_zip_t* data; /*!< compressed page data */ + +#ifdef UNIV_DEBUG + unsigned m_start:16; /*!< start offset of modification log */ + bool m_external; /*!< Allocated externally, not from the + buffer pool */ +#endif /* UNIV_DEBUG */ + unsigned m_end:16; /*!< end offset of modification log */ + unsigned m_nonempty:1; /*!< TRUE if the modification log + is not empty */ + unsigned n_blobs:12; /*!< number of externally stored + columns on the page; the maximum + is 744 on a 16 KiB page */ + unsigned ssize:PAGE_ZIP_SSIZE_BITS; + /*!< 0 or compressed page shift size; + the size in bytes is + (UNIV_ZIP_SIZE_MIN >> 1) << ssize. */ +}; + +/** Compression statistics for a given page size */ +struct page_zip_stat_t { + /** Number of page compressions */ + ulint compressed; + /** Number of successful page compressions */ + ulint compressed_ok; + /** Number of page decompressions */ + ulint decompressed; + /** Duration of page compressions in microseconds */ + ib_uint64_t compressed_usec; + /** Duration of page decompressions in microseconds */ + ib_uint64_t decompressed_usec; + page_zip_stat_t() : + /* Initialize members to 0 so that when we do + stlmap[key].compressed++ and element with "key" does not + exist it gets inserted with zeroed members. */ + compressed(0), + compressed_ok(0), + decompressed(0), + compressed_usec(0), + decompressed_usec(0) + { } +}; + +/** Compression statistics types */ +typedef map<index_id_t, page_zip_stat_t> page_zip_stat_per_index_t; + +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +extern page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Statistics on compression, indexed by dict_index_t::id */ +extern page_zip_stat_per_index_t page_zip_stat_per_index; +extern ib_mutex_t page_zip_stat_per_index_mutex; +#ifdef HAVE_PSI_INTERFACE +extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ + +/**********************************************************************//** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/**********************************************************************//** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the owned flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/**********************************************************************//** +Shift the dense page directory when a record is deleted. */ +UNIV_INTERN +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + byte* rec, /*!< in: deleted record */ + dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets,/*!< in: rec_get_offsets(rec) */ + const byte* free) /*!< in: previous start of the free list */ + __attribute__((nonnull(1,2,3,4))); + +/**********************************************************************//** +Add a slot to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_add_slot( +/*==================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint is_clustered) /*!< in: nonzero for clustered index, + zero for others */ + __attribute__((nonnull)); +#endif diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h new file mode 100644 index 00000000000..9d3b78ed2fc --- /dev/null +++ b/storage/innobase/include/page0zip.h @@ -0,0 +1,538 @@ +/***************************************************************************** + +Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0zip.h +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#ifndef page0zip_h +#define page0zip_h + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "mtr0types.h" +#include "page0types.h" +#include "buf0types.h" +#include "dict0types.h" +#include "srv0srv.h" +#include "trx0types.h" +#include "mem0mem.h" + +/* Compression level to be used by zlib. Settable by user. */ +extern uint page_zip_level; + +/* Default compression level. */ +#define DEFAULT_COMPRESSION_LEVEL 6 + +/* Whether or not to log compressed page images to avoid possible +compression algorithm changes in zlib. */ +extern my_bool page_zip_log_pages; + +/**********************************************************************//** +Determine the size of a compressed page in bytes. +@return size in bytes */ +UNIV_INLINE +ulint +page_zip_get_size( +/*==============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ + __attribute__((nonnull, pure)); +/**********************************************************************//** +Set the size of a compressed page in bytes. */ +UNIV_INLINE +void +page_zip_set_size( +/*==============*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint size); /*!< in: size in bytes */ + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Determine if a record is so big that it needs to be stored externally. +@return FALSE if the entire record can be stored locally on the page */ +UNIV_INLINE +ibool +page_zip_rec_needs_ext( +/*===================*/ + ulint rec_size, /*!< in: length of the record in bytes */ + ulint comp, /*!< in: nonzero=compact format */ + ulint n_fields, /*!< in: number of fields in the record; + ignored if zip_size == 0 */ + ulint zip_size) /*!< in: compressed page size in bytes, or 0 */ + __attribute__((const)); + +/**********************************************************************//** +Determine the guaranteed free space on an empty page. +@return minimum payload size on the page */ +UNIV_INTERN +ulint +page_zip_empty_size( +/*================*/ + ulint n_fields, /*!< in: number of columns in the index */ + ulint zip_size) /*!< in: compressed page size in bytes */ + __attribute__((const)); +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Initialize a compressed page descriptor. */ +UNIV_INLINE +void +page_zip_des_init( +/*==============*/ + page_zip_des_t* page_zip); /*!< in/out: compressed page + descriptor */ + +/**********************************************************************//** +Configure the zlib allocator to use the given memory heap. */ +UNIV_INTERN +void +page_zip_set_alloc( +/*===============*/ + void* stream, /*!< in/out: zlib stream */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/**********************************************************************//** +Compress a page. +@return TRUE on success, FALSE on failure; page_zip will be left +intact on failure. */ +UNIV_INTERN +ibool +page_zip_compress( +/*==============*/ + page_zip_des_t* page_zip,/*!< in: size; out: data, n_blobs, + m_start, m_end, m_nonempty */ + const page_t* page, /*!< in: uncompressed page */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint level, /*!< in: compression level */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2,3))); + +/**********************************************************************//** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. +@return TRUE on success, FALSE on failure */ +UNIV_INTERN +ibool +page_zip_decompress( +/*================*/ + page_zip_des_t* page_zip,/*!< in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page, /*!< out: uncompressed page, may be trashed */ + ibool all) /*!< in: TRUE=decompress the whole page; + FALSE=verify but do not copy some + page header fields that should not change + after page creation */ + __attribute__((nonnull(1,2))); + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate a compressed page descriptor. +@return TRUE if ok */ +UNIV_INLINE +ibool +page_zip_simple_validate( +/*=====================*/ + const page_zip_des_t* page_zip); /*!< in: compressed page + descriptor */ +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_ZIP_DEBUG +/**********************************************************************//** +Check that the compressed and decompressed pages match. +@return TRUE if valid, FALSE if not */ +UNIV_INTERN +ibool +page_zip_validate_low( +/*==================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index, /*!< in: index of the page, if known */ + ibool sloppy) /*!< in: FALSE=strict, + TRUE=ignore the MIN_REC_FLAG */ + __attribute__((nonnull(1,2))); +/**********************************************************************//** +Check that the compressed and decompressed pages match. */ +UNIV_INTERN +ibool +page_zip_validate( +/*==============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index) /*!< in: index of the page, if known */ + __attribute__((nonnull(1,2))); +#endif /* UNIV_ZIP_DEBUG */ + +/**********************************************************************//** +Determine how big record can be inserted without recompressing the page. +@return a positive number indicating the maximum size of a record +whose insertion is guaranteed to succeed, or zero or negative */ +UNIV_INLINE +lint +page_zip_max_ins_size( +/*==================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust)/*!< in: TRUE if clustered index */ + __attribute__((nonnull, pure)); + +/**********************************************************************//** +Determine if enough space is available in the modification log. +@return TRUE if page_zip_write_rec() will succeed */ +UNIV_INLINE +ibool +page_zip_available( +/*===============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust,/*!< in: TRUE if clustered index */ + ulint length, /*!< in: combined size of the record */ + ulint create) /*!< in: nonzero=add the record to + the heap */ + __attribute__((nonnull, pure)); + +/**********************************************************************//** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* str, /*!< in: address on the uncompressed page */ + ulint length, /*!< in: length of the data */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/**********************************************************************//** +Write an entire record on the compressed page. The data must already +have been written to the uncompressed page. */ +UNIV_INTERN +void +page_zip_write_rec( +/*===============*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record being written */ + dict_index_t* index, /*!< in: the index the record belongs to */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint create) /*!< in: nonzero=insert, zero=update */ + __attribute__((nonnull)); + +/***********************************************************//** +Parses a log record of writing a BLOB pointer of a record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_blob_ptr( +/*==========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip);/*!< in/out: compressed page */ + +/**********************************************************************//** +Write a BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. */ +UNIV_INTERN +void +page_zip_write_blob_ptr( +/*====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in/out: record whose data is being + written */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint n, /*!< in: column index */ + mtr_t* mtr) /*!< in: mini-transaction handle, + or NULL if no logging is needed */ + __attribute__((nonnull(1,2,3,4))); + +/***********************************************************//** +Parses a log record of writing the node pointer of a record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_node_ptr( +/*==========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip);/*!< in/out: compressed page */ + +/**********************************************************************//** +Write the node pointer of a record on a non-leaf compressed page. */ +UNIV_INTERN +void +page_zip_write_node_ptr( +/*====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + ulint size, /*!< in: data size of rec */ + ulint ptr, /*!< in: node pointer */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/**********************************************************************//** +Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */ +UNIV_INTERN +void +page_zip_write_trx_id_and_roll_ptr( +/*===============================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint trx_id_col,/*!< in: column number of TRX_ID in rec */ + trx_id_t trx_id, /*!< in: transaction identifier */ + roll_ptr_t roll_ptr)/*!< in: roll_ptr */ + __attribute__((nonnull)); + +/**********************************************************************//** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/**********************************************************************//** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the owned flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/**********************************************************************//** +Insert a record to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_insert( +/*================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* prev_rec,/*!< in: record after which to insert */ + const byte* free_rec,/*!< in: record from which rec was + allocated, or NULL */ + byte* rec); /*!< in: record to insert */ + +/**********************************************************************//** +Shift the dense page directory and the array of BLOB pointers +when a record is deleted. */ +UNIV_INTERN +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: deleted record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + const byte* free) /*!< in: previous start of + the free list */ + __attribute__((nonnull(1,2,3,4))); + +/**********************************************************************//** +Add a slot to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_add_slot( +/*==================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint is_clustered) /*!< in: nonzero for clustered index, + zero for others */ + __attribute__((nonnull)); + +/***********************************************************//** +Parses a log record of writing to the header of a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_header( +/*========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip);/*!< in/out: compressed page */ + +/**********************************************************************//** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. +However, the data portion of the uncompressed page may differ from +the compressed page when a record is being inserted in +page_cur_insert_rec_low(). */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* str, /*!< in: address on the uncompressed page */ + ulint length, /*!< in: length of the data */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/**********************************************************************//** +Reorganize and compress a page. This is a low-level operation for +compressed pages, to be used when page_zip_compress() fails. +On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written. +The function btr_page_reorganize() should be preferred whenever possible. +IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. +@return TRUE on success, FALSE on failure; page_zip will be left +intact on failure, but page will be overwritten. */ +UNIV_INTERN +ibool +page_zip_reorganize( +/*================*/ + buf_block_t* block, /*!< in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /*!< in: index of the B-tree node */ + mtr_t* mtr) /*!< in: mini-transaction */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Copy the records of a page byte for byte. Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. */ +UNIV_INTERN +void +page_zip_copy_recs( +/*===============*/ + page_zip_des_t* page_zip, /*!< out: copy of src_zip + (n_blobs, m_start, m_end, + m_nonempty, data[0..size-1]) */ + page_t* page, /*!< out: copy of src */ + const page_zip_des_t* src_zip, /*!< in: compressed page */ + const page_t* src, /*!< in: page */ + dict_index_t* index, /*!< in: index of the B-tree */ + mtr_t* mtr) /*!< in: mini-transaction */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Parses a log record of compressing an index page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_compress( +/*====================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< out: uncompressed page */ + page_zip_des_t* page_zip)/*!< out: compressed page */ + __attribute__((nonnull(1,2))); + +/**********************************************************************//** +Calculate the compressed page checksum. +@return page checksum */ +UNIV_INTERN +ulint +page_zip_calc_checksum( +/*===================*/ + const void* data, /*!< in: compressed page */ + ulint size, /*!< in: size of compressed page */ + srv_checksum_algorithm_t algo) /*!< in: algorithm to use */ + __attribute__((nonnull)); + +/**********************************************************************//** +Verify a compressed page's checksum. +@return TRUE if the stored checksum is valid according to the value of +innodb_checksum_algorithm */ +UNIV_INTERN +ibool +page_zip_verify_checksum( +/*=====================*/ + const void* data, /*!< in: compressed page */ + ulint size); /*!< in: size of compressed page */ +/**********************************************************************//** +Write a log record of compressing an index page without the data on the page. */ +UNIV_INLINE +void +page_zip_compress_write_log_no_data( +/*================================*/ + ulint level, /*!< in: compression level */ + const page_t* page, /*!< in: page that is compressed */ + dict_index_t* index, /*!< in: index */ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +Parses a log record of compressing an index page without the data. +@return end of log record or NULL */ +UNIV_INLINE +byte* +page_zip_parse_compress_no_data( +/*============================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr, /*!< in: buffer end */ + page_t* page, /*!< in: uncompressed page */ + page_zip_des_t* page_zip, /*!< out: compressed page */ + dict_index_t* index) /*!< in: index */ + __attribute__((nonnull(1,2))); + +/**********************************************************************//** +Reset the counters used for filling +INFORMATION_SCHEMA.innodb_cmp_per_index. */ +UNIV_INLINE +void +page_zip_reset_stat_per_index(); +/*===========================*/ + +#ifndef UNIV_HOTBACKUP +/** Check if a pointer to an uncompressed page matches a compressed page. +When we IMPORT a tablespace the blocks and accompanying frames are allocted +from outside the buffer pool. +@param ptr pointer to an uncompressed page frame +@param page_zip compressed page descriptor +@return TRUE if ptr and page_zip refer to the same block */ +# define PAGE_ZIP_MATCH(ptr, page_zip) \ + (((page_zip)->m_external \ + && (page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data)) \ + || buf_frame_get_page_zip(ptr) == (page_zip)) +#else /* !UNIV_HOTBACKUP */ +/** Check if a pointer to an uncompressed page matches a compressed page. +@param ptr pointer to an uncompressed page frame +@param page_zip compressed page descriptor +@return TRUE if ptr and page_zip refer to the same block */ +# define PAGE_ZIP_MATCH(ptr, page_zip) \ + (page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data) +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif + +#ifndef UNIV_NONINL +# include "page0zip.ic" +#endif + +#endif /* page0zip_h */ diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic new file mode 100644 index 00000000000..6c7d8cd32c7 --- /dev/null +++ b/storage/innobase/include/page0zip.ic @@ -0,0 +1,456 @@ +/***************************************************************************** + +Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0zip.ic +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "page0zip.h" +#include "mtr0log.h" +#include "page0page.h" + +/* The format of compressed pages is as follows. + +The header and trailer of the uncompressed pages, excluding the page +directory in the trailer, are copied as is to the header and trailer +of the compressed page. + +At the end of the compressed page, there is a dense page directory +pointing to every user record contained on the page, including deleted +records on the free list. The dense directory is indexed in the +collation order, i.e., in the order in which the record list is +linked on the uncompressed page. The infimum and supremum records are +excluded. The two most significant bits of the entries are allocated +for the delete-mark and an n_owned flag indicating the last record in +a chain of records pointed to from the sparse page directory on the +uncompressed page. + +The data between PAGE_ZIP_START and the last page directory entry will +be written in compressed format, starting at offset PAGE_DATA. +Infimum and supremum records are not stored. We exclude the +REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered +from the dense page directory stored at the end of the compressed +page. + +The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and +roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of +externally stored columns are stored separately, in ascending order of +heap_no and column index, starting backwards from the dense page +directory. + +The compressed data stream may be followed by a modification log +covering the compressed portion of the page, as follows. + +MODIFICATION LOG ENTRY FORMAT +- write record: + - (heap_no - 1) << 1 (1..2 bytes) + - extra bytes backwards + - data bytes +- clear record: + - (heap_no - 1) << 1 | 1 (1..2 bytes) + +The integer values are stored in a variable-length format: +- 0xxxxxxx: 0..127 +- 1xxxxxxx xxxxxxxx: 0..32767 + +The end of the modification log is marked by a 0 byte. + +In summary, the compressed page looks like this: + +(1) Uncompressed page header (PAGE_DATA bytes) +(2) Compressed index information +(3) Compressed page data +(4) Page modification log (page_zip->m_start..page_zip->m_end) +(5) Empty zero-filled space +(6) BLOB pointers (on leaf pages) + - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column + - in descending collation order +(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes, + - indexed by heap_no + - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes + - REC_NODE_PTR_SIZE for non-leaf pages + - 0 otherwise +(8) dense page directory, stored backwards + - n_dense = n_heap - 2 + - existing records in ascending collation order + - deleted records (free list) in link order +*/ + +/** Start offset of the area that will be compressed */ +#define PAGE_ZIP_START PAGE_NEW_SUPREMUM_END +/** Size of an compressed page directory entry */ +#define PAGE_ZIP_DIR_SLOT_SIZE 2 +/** Mask of record offsets */ +#define PAGE_ZIP_DIR_SLOT_MASK 0x3fff +/** 'owned' flag */ +#define PAGE_ZIP_DIR_SLOT_OWNED 0x4000 +/** 'deleted' flag */ +#define PAGE_ZIP_DIR_SLOT_DEL 0x8000 + +/**********************************************************************//** +Determine the size of a compressed page in bytes. +@return size in bytes */ +UNIV_INLINE +ulint +page_zip_get_size( +/*==============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + ulint size; + + if (!page_zip->ssize) { + return(0); + } + + size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize; + + ut_ad(size >= UNIV_ZIP_SIZE_MIN); + ut_ad(size <= UNIV_PAGE_SIZE); + + return(size); +} +/**********************************************************************//** +Set the size of a compressed page in bytes. */ +UNIV_INLINE +void +page_zip_set_size( +/*==============*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint size) /*!< in: size in bytes */ +{ + if (size) { + int ssize; + + ut_ad(ut_is_2pow(size)); + + for (ssize = 1; size > (ulint) (512 << ssize); ssize++) { + } + + page_zip->ssize = ssize; + } else { + page_zip->ssize = 0; + } + + ut_ad(page_zip_get_size(page_zip) == size); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Determine if a record is so big that it needs to be stored externally. +@return FALSE if the entire record can be stored locally on the page */ +UNIV_INLINE +ibool +page_zip_rec_needs_ext( +/*===================*/ + ulint rec_size, /*!< in: length of the record in bytes */ + ulint comp, /*!< in: nonzero=compact format */ + ulint n_fields, /*!< in: number of fields in the record; + ignored if zip_size == 0 */ + ulint zip_size) /*!< in: compressed page size in bytes, or 0 */ +{ + ut_ad(rec_size > comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(comp || !zip_size); + +#if UNIV_PAGE_SIZE_MAX > REC_MAX_DATA_SIZE + if (rec_size >= REC_MAX_DATA_SIZE) { + return(TRUE); + } +#endif + + if (zip_size) { + ut_ad(comp); + /* On a compressed page, there is a two-byte entry in + the dense page directory for every record. But there + is no record header. There should be enough room for + one record on an empty leaf page. Subtract 1 byte for + the encoded heap number. Check also the available space + on the uncompressed page. */ + return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1) + >= page_zip_empty_size(n_fields, zip_size) + || rec_size >= page_get_free_space_of_empty(TRUE) / 2); + } + + return(rec_size >= page_get_free_space_of_empty(comp) / 2); +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate a compressed page descriptor. +@return TRUE if ok */ +UNIV_INLINE +ibool +page_zip_simple_validate( +/*=====================*/ + const page_zip_des_t* page_zip)/*!< in: compressed page descriptor */ +{ + ut_ad(page_zip); + ut_ad(page_zip->data); + ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE); + ut_ad(page_zip->m_start <= page_zip->m_end); + ut_ad(page_zip->m_end < page_zip_get_size(page_zip)); + ut_ad(page_zip->n_blobs + < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE); + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Determine if the length of the page trailer. +@return length of the page trailer, in bytes, not including the +terminating zero byte of the modification log */ +UNIV_INLINE +ibool +page_zip_get_trailer_len( +/*=====================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust)/*!< in: TRUE if clustered index */ +{ + ulint uncompressed_size; + + ut_ad(page_zip_simple_validate(page_zip)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + if (!page_is_leaf(page_zip->data)) { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE + + REC_NODE_PTR_SIZE; + ut_ad(!page_zip->n_blobs); + } else if (is_clust) { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE; + ut_ad(!page_zip->n_blobs); + } + + return((page_dir_get_n_heap(page_zip->data) - 2) + * uncompressed_size + + page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE); +} + +/**********************************************************************//** +Determine how big record can be inserted without recompressing the page. +@return a positive number indicating the maximum size of a record +whose insertion is guaranteed to succeed, or zero or negative */ +UNIV_INLINE +lint +page_zip_max_ins_size( +/*==================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust)/*!< in: TRUE if clustered index */ +{ + ulint trailer_len; + + trailer_len = page_zip_get_trailer_len(page_zip, is_clust); + + /* When a record is created, a pointer may be added to + the dense directory. + Likewise, space for the columns that will not be + compressed will be allocated from the page trailer. + Also the BLOB pointers will be allocated from there, but + we may as well count them in the length of the record. */ + + trailer_len += PAGE_ZIP_DIR_SLOT_SIZE; + + return((lint) page_zip_get_size(page_zip) + - trailer_len - page_zip->m_end + - (REC_N_NEW_EXTRA_BYTES - 2)); +} + +/**********************************************************************//** +Determine if enough space is available in the modification log. +@return TRUE if enough space is available */ +UNIV_INLINE +ibool +page_zip_available( +/*===============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + ibool is_clust,/*!< in: TRUE if clustered index */ + ulint length, /*!< in: combined size of the record */ + ulint create) /*!< in: nonzero=add the record to + the heap */ +{ + ulint trailer_len; + + ut_ad(length > REC_N_NEW_EXTRA_BYTES); + + trailer_len = page_zip_get_trailer_len(page_zip, is_clust); + + /* Subtract the fixed extra bytes and add the maximum + space needed for identifying the record (encoded heap_no). */ + length -= REC_N_NEW_EXTRA_BYTES - 2; + + if (create > 0) { + /* When a record is created, a pointer may be added to + the dense directory. + Likewise, space for the columns that will not be + compressed will be allocated from the page trailer. + Also the BLOB pointers will be allocated from there, but + we may as well count them in the length of the record. */ + + trailer_len += PAGE_ZIP_DIR_SLOT_SIZE; + } + + return(length + trailer_len + page_zip->m_end + < page_zip_get_size(page_zip)); +} + +/**********************************************************************//** +Initialize a compressed page descriptor. */ +UNIV_INLINE +void +page_zip_des_init( +/*==============*/ + page_zip_des_t* page_zip) /*!< in/out: compressed page + descriptor */ +{ + memset(page_zip, 0, sizeof *page_zip); +} + +/**********************************************************************//** +Write a log record of writing to the uncompressed header portion of a page. */ +UNIV_INTERN +void +page_zip_write_header_log( +/*======================*/ + const byte* data,/*!< in: data on the uncompressed page */ + ulint length, /*!< in: length of the data */ + mtr_t* mtr); /*!< in: mini-transaction */ + +/**********************************************************************//** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. +However, the data portion of the uncompressed page may differ from +the compressed page when a record is being inserted in +page_cur_insert_rec_zip(). */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* str, /*!< in: address on the uncompressed page */ + ulint length, /*!< in: length of the data */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ +{ + ulint pos; + + ut_ad(PAGE_ZIP_MATCH(str, page_zip)); + ut_ad(page_zip_simple_validate(page_zip)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + pos = page_offset(str); + + ut_ad(pos < PAGE_DATA); + + memcpy(page_zip->data + pos, str, length); + + /* The following would fail in page_cur_insert_rec_zip(). */ + /* ut_ad(page_zip_validate(page_zip, str - pos)); */ + + if (mtr) { +#ifndef UNIV_HOTBACKUP + page_zip_write_header_log(str, length, mtr); +#endif /* !UNIV_HOTBACKUP */ + } +} + +/**********************************************************************//** +Write a log record of compressing an index page without the data on the page. */ +UNIV_INLINE +void +page_zip_compress_write_log_no_data( +/*================================*/ + ulint level, /*!< in: compression level */ + const page_t* page, /*!< in: page that is compressed */ + dict_index_t* index, /*!< in: index */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr = mlog_open_and_write_index( + mtr, page, index, MLOG_ZIP_PAGE_COMPRESS_NO_DATA, 1); + + if (log_ptr) { + mach_write_to_1(log_ptr, level); + mlog_close(mtr, log_ptr + 1); + } +} + +/**********************************************************************//** +Parses a log record of compressing an index page without the data. +@return end of log record or NULL */ +UNIV_INLINE +byte* +page_zip_parse_compress_no_data( +/*============================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr, /*!< in: buffer end */ + page_t* page, /*!< in: uncompressed page */ + page_zip_des_t* page_zip, /*!< out: compressed page */ + dict_index_t* index) /*!< in: index */ +{ + ulint level; + if (end_ptr == ptr) { + return(NULL); + } + + level = mach_read_from_1(ptr); + + /* If page compression fails then there must be something wrong + because a compress log record is logged only if the compression + was successful. Crash in this case. */ + + if (page + && !page_zip_compress(page_zip, page, index, level, NULL)) { + ut_error; + } + + return(ptr + 1); +} + +/**********************************************************************//** +Reset the counters used for filling +INFORMATION_SCHEMA.innodb_cmp_per_index. */ +UNIV_INLINE +void +page_zip_reset_stat_per_index() +/*===========================*/ +{ + mutex_enter(&page_zip_stat_per_index_mutex); + + page_zip_stat_per_index.erase( + page_zip_stat_per_index.begin(), + page_zip_stat_per_index.end()); + + mutex_exit(&page_zip_stat_per_index_mutex); +} + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h new file mode 100644 index 00000000000..8e725fe9545 --- /dev/null +++ b/storage/innobase/include/pars0grm.h @@ -0,0 +1,261 @@ +/* A Bison parser, made by GNU Bison 2.3. */ + +/* Skeleton interface for Bison's Yacc-like parsers in C + + Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006 + Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + PARS_INT_LIT = 258, + PARS_FLOAT_LIT = 259, + PARS_STR_LIT = 260, + PARS_FIXBINARY_LIT = 261, + PARS_BLOB_LIT = 262, + PARS_NULL_LIT = 263, + PARS_ID_TOKEN = 264, + PARS_AND_TOKEN = 265, + PARS_OR_TOKEN = 266, + PARS_NOT_TOKEN = 267, + PARS_GE_TOKEN = 268, + PARS_LE_TOKEN = 269, + PARS_NE_TOKEN = 270, + PARS_PROCEDURE_TOKEN = 271, + PARS_IN_TOKEN = 272, + PARS_OUT_TOKEN = 273, + PARS_BINARY_TOKEN = 274, + PARS_BLOB_TOKEN = 275, + PARS_INT_TOKEN = 276, + PARS_INTEGER_TOKEN = 277, + PARS_FLOAT_TOKEN = 278, + PARS_CHAR_TOKEN = 279, + PARS_IS_TOKEN = 280, + PARS_BEGIN_TOKEN = 281, + PARS_END_TOKEN = 282, + PARS_IF_TOKEN = 283, + PARS_THEN_TOKEN = 284, + PARS_ELSE_TOKEN = 285, + PARS_ELSIF_TOKEN = 286, + PARS_LOOP_TOKEN = 287, + PARS_WHILE_TOKEN = 288, + PARS_RETURN_TOKEN = 289, + PARS_SELECT_TOKEN = 290, + PARS_SUM_TOKEN = 291, + PARS_COUNT_TOKEN = 292, + PARS_DISTINCT_TOKEN = 293, + PARS_FROM_TOKEN = 294, + PARS_WHERE_TOKEN = 295, + PARS_FOR_TOKEN = 296, + PARS_DDOT_TOKEN = 297, + PARS_READ_TOKEN = 298, + PARS_ORDER_TOKEN = 299, + PARS_BY_TOKEN = 300, + PARS_ASC_TOKEN = 301, + PARS_DESC_TOKEN = 302, + PARS_INSERT_TOKEN = 303, + PARS_INTO_TOKEN = 304, + PARS_VALUES_TOKEN = 305, + PARS_UPDATE_TOKEN = 306, + PARS_SET_TOKEN = 307, + PARS_DELETE_TOKEN = 308, + PARS_CURRENT_TOKEN = 309, + PARS_OF_TOKEN = 310, + PARS_CREATE_TOKEN = 311, + PARS_TABLE_TOKEN = 312, + PARS_INDEX_TOKEN = 313, + PARS_UNIQUE_TOKEN = 314, + PARS_CLUSTERED_TOKEN = 315, + PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316, + PARS_ON_TOKEN = 317, + PARS_ASSIGN_TOKEN = 318, + PARS_DECLARE_TOKEN = 319, + PARS_CURSOR_TOKEN = 320, + PARS_SQL_TOKEN = 321, + PARS_OPEN_TOKEN = 322, + PARS_FETCH_TOKEN = 323, + PARS_CLOSE_TOKEN = 324, + PARS_NOTFOUND_TOKEN = 325, + PARS_TO_CHAR_TOKEN = 326, + PARS_TO_NUMBER_TOKEN = 327, + PARS_TO_BINARY_TOKEN = 328, + PARS_BINARY_TO_NUMBER_TOKEN = 329, + PARS_SUBSTR_TOKEN = 330, + PARS_REPLSTR_TOKEN = 331, + PARS_CONCAT_TOKEN = 332, + PARS_INSTR_TOKEN = 333, + PARS_LENGTH_TOKEN = 334, + PARS_SYSDATE_TOKEN = 335, + PARS_PRINTF_TOKEN = 336, + PARS_ASSERT_TOKEN = 337, + PARS_RND_TOKEN = 338, + PARS_RND_STR_TOKEN = 339, + PARS_ROW_PRINTF_TOKEN = 340, + PARS_COMMIT_TOKEN = 341, + PARS_ROLLBACK_TOKEN = 342, + PARS_WORK_TOKEN = 343, + PARS_UNSIGNED_TOKEN = 344, + PARS_EXIT_TOKEN = 345, + PARS_FUNCTION_TOKEN = 346, + PARS_LOCK_TOKEN = 347, + PARS_SHARE_TOKEN = 348, + PARS_MODE_TOKEN = 349, + PARS_LIKE_TOKEN = 350, + PARS_LIKE_TOKEN_EXACT = 351, + PARS_LIKE_TOKEN_PREFIX = 352, + PARS_LIKE_TOKEN_SUFFIX = 353, + PARS_LIKE_TOKEN_SUBSTR = 354, + PARS_TABLE_NAME_TOKEN = 355, + PARS_COMPACT_TOKEN = 356, + PARS_BLOCK_SIZE_TOKEN = 357, + PARS_BIGINT_TOKEN = 358, + NEG = 359 + }; +#endif +/* Tokens. */ +#define PARS_INT_LIT 258 +#define PARS_FLOAT_LIT 259 +#define PARS_STR_LIT 260 +#define PARS_FIXBINARY_LIT 261 +#define PARS_BLOB_LIT 262 +#define PARS_NULL_LIT 263 +#define PARS_ID_TOKEN 264 +#define PARS_AND_TOKEN 265 +#define PARS_OR_TOKEN 266 +#define PARS_NOT_TOKEN 267 +#define PARS_GE_TOKEN 268 +#define PARS_LE_TOKEN 269 +#define PARS_NE_TOKEN 270 +#define PARS_PROCEDURE_TOKEN 271 +#define PARS_IN_TOKEN 272 +#define PARS_OUT_TOKEN 273 +#define PARS_BINARY_TOKEN 274 +#define PARS_BLOB_TOKEN 275 +#define PARS_INT_TOKEN 276 +#define PARS_INTEGER_TOKEN 277 +#define PARS_FLOAT_TOKEN 278 +#define PARS_CHAR_TOKEN 279 +#define PARS_IS_TOKEN 280 +#define PARS_BEGIN_TOKEN 281 +#define PARS_END_TOKEN 282 +#define PARS_IF_TOKEN 283 +#define PARS_THEN_TOKEN 284 +#define PARS_ELSE_TOKEN 285 +#define PARS_ELSIF_TOKEN 286 +#define PARS_LOOP_TOKEN 287 +#define PARS_WHILE_TOKEN 288 +#define PARS_RETURN_TOKEN 289 +#define PARS_SELECT_TOKEN 290 +#define PARS_SUM_TOKEN 291 +#define PARS_COUNT_TOKEN 292 +#define PARS_DISTINCT_TOKEN 293 +#define PARS_FROM_TOKEN 294 +#define PARS_WHERE_TOKEN 295 +#define PARS_FOR_TOKEN 296 +#define PARS_DDOT_TOKEN 297 +#define PARS_READ_TOKEN 298 +#define PARS_ORDER_TOKEN 299 +#define PARS_BY_TOKEN 300 +#define PARS_ASC_TOKEN 301 +#define PARS_DESC_TOKEN 302 +#define PARS_INSERT_TOKEN 303 +#define PARS_INTO_TOKEN 304 +#define PARS_VALUES_TOKEN 305 +#define PARS_UPDATE_TOKEN 306 +#define PARS_SET_TOKEN 307 +#define PARS_DELETE_TOKEN 308 +#define PARS_CURRENT_TOKEN 309 +#define PARS_OF_TOKEN 310 +#define PARS_CREATE_TOKEN 311 +#define PARS_TABLE_TOKEN 312 +#define PARS_INDEX_TOKEN 313 +#define PARS_UNIQUE_TOKEN 314 +#define PARS_CLUSTERED_TOKEN 315 +#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316 +#define PARS_ON_TOKEN 317 +#define PARS_ASSIGN_TOKEN 318 +#define PARS_DECLARE_TOKEN 319 +#define PARS_CURSOR_TOKEN 320 +#define PARS_SQL_TOKEN 321 +#define PARS_OPEN_TOKEN 322 +#define PARS_FETCH_TOKEN 323 +#define PARS_CLOSE_TOKEN 324 +#define PARS_NOTFOUND_TOKEN 325 +#define PARS_TO_CHAR_TOKEN 326 +#define PARS_TO_NUMBER_TOKEN 327 +#define PARS_TO_BINARY_TOKEN 328 +#define PARS_BINARY_TO_NUMBER_TOKEN 329 +#define PARS_SUBSTR_TOKEN 330 +#define PARS_REPLSTR_TOKEN 331 +#define PARS_CONCAT_TOKEN 332 +#define PARS_INSTR_TOKEN 333 +#define PARS_LENGTH_TOKEN 334 +#define PARS_SYSDATE_TOKEN 335 +#define PARS_PRINTF_TOKEN 336 +#define PARS_ASSERT_TOKEN 337 +#define PARS_RND_TOKEN 338 +#define PARS_RND_STR_TOKEN 339 +#define PARS_ROW_PRINTF_TOKEN 340 +#define PARS_COMMIT_TOKEN 341 +#define PARS_ROLLBACK_TOKEN 342 +#define PARS_WORK_TOKEN 343 +#define PARS_UNSIGNED_TOKEN 344 +#define PARS_EXIT_TOKEN 345 +#define PARS_FUNCTION_TOKEN 346 +#define PARS_LOCK_TOKEN 347 +#define PARS_SHARE_TOKEN 348 +#define PARS_MODE_TOKEN 349 +#define PARS_LIKE_TOKEN 350 +#define PARS_LIKE_TOKEN_EXACT 351 +#define PARS_LIKE_TOKEN_PREFIX 352 +#define PARS_LIKE_TOKEN_SUFFIX 353 +#define PARS_LIKE_TOKEN_SUBSTR 354 +#define PARS_TABLE_NAME_TOKEN 355 +#define PARS_COMPACT_TOKEN 356 +#define PARS_BLOCK_SIZE_TOKEN 357 +#define PARS_BIGINT_TOKEN 358 +#define NEG 359 + + + + +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +typedef int YYSTYPE; +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +# define YYSTYPE_IS_TRIVIAL 1 +#endif + +extern YYSTYPE yylval; + diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h new file mode 100644 index 00000000000..1084d644c90 --- /dev/null +++ b/storage/innobase/include/pars0opt.h @@ -0,0 +1,75 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0opt.h +Simple SQL optimizer + +Created 12/21/1997 Heikki Tuuri +*******************************************************/ + +#ifndef pars0opt_h +#define pars0opt_h + +#include "univ.i" +#include "que0types.h" +#include "usr0types.h" +#include "pars0sym.h" +#include "dict0types.h" +#include "row0sel.h" + +/*******************************************************************//** +Optimizes a select. Decides which indexes to tables to use. The tables +are accessed in the order that they were written to the FROM part in the +select statement. */ +UNIV_INTERN +void +opt_search_plan( +/*============*/ + sel_node_t* sel_node); /*!< in: parsed select node */ +/*******************************************************************//** +Looks for occurrences of the columns of the table in the query subgraph and +adds them to the list of columns if an occurrence of the same column does not +already exist in the list. If the column is already in the list, puts a value +indirection to point to the occurrence in the column list, except if the +column occurrence we are looking at is in the column list, in which case +nothing is done. */ +UNIV_INTERN +void +opt_find_all_cols( +/*==============*/ + ibool copy_val, /*!< in: if TRUE, new found columns are + added as columns to copy */ + dict_index_t* index, /*!< in: index to use */ + sym_node_list_t* col_list, /*!< in: base node of a list where + to add new found columns */ + plan_t* plan, /*!< in: plan or NULL */ + que_node_t* exp); /*!< in: expression or condition */ +/********************************************************************//** +Prints info of a query plan. */ +UNIV_INTERN +void +opt_print_query_plan( +/*=================*/ + sel_node_t* sel_node); /*!< in: select node */ + +#ifndef UNIV_NONINL +#include "pars0opt.ic" +#endif + +#endif diff --git a/storage/innobase/include/pars0opt.ic b/storage/innobase/include/pars0opt.ic new file mode 100644 index 00000000000..786d911ca3d --- /dev/null +++ b/storage/innobase/include/pars0opt.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0opt.ic +Simple SQL optimizer + +Created 12/21/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h new file mode 100644 index 00000000000..65ff7533828 --- /dev/null +++ b/storage/innobase/include/pars0pars.h @@ -0,0 +1,826 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0pars.h +SQL parser + +Created 11/19/1996 Heikki Tuuri +*******************************************************/ + +#ifndef pars0pars_h +#define pars0pars_h + +#include "univ.i" +#include "que0types.h" +#include "usr0types.h" +#include "pars0types.h" +#include "row0types.h" +#include "trx0types.h" +#include "ut0vec.h" + +/** Type of the user functions. The first argument is always InnoDB-supplied +and varies in type, while 'user_arg' is a user-supplied argument. The +meaning of the return type also varies. See the individual use cases, e.g. +the FETCH statement, for details on them. */ +typedef ibool (*pars_user_func_cb_t)(void* arg, void* user_arg); + +/** If the following is set TRUE, the parser will emit debugging +information */ +extern int yydebug; + +#ifdef UNIV_SQL_DEBUG +/** If the following is set TRUE, the lexer will print the SQL string +as it tokenizes it */ +extern ibool pars_print_lexed; +#endif /* UNIV_SQL_DEBUG */ + +/* Global variable used while parsing a single procedure or query : the code is +NOT re-entrant */ +extern sym_tab_t* pars_sym_tab_global; + +extern pars_res_word_t pars_to_char_token; +extern pars_res_word_t pars_to_number_token; +extern pars_res_word_t pars_to_binary_token; +extern pars_res_word_t pars_binary_to_number_token; +extern pars_res_word_t pars_substr_token; +extern pars_res_word_t pars_replstr_token; +extern pars_res_word_t pars_concat_token; +extern pars_res_word_t pars_length_token; +extern pars_res_word_t pars_instr_token; +extern pars_res_word_t pars_sysdate_token; +extern pars_res_word_t pars_printf_token; +extern pars_res_word_t pars_assert_token; +extern pars_res_word_t pars_rnd_token; +extern pars_res_word_t pars_rnd_str_token; +extern pars_res_word_t pars_count_token; +extern pars_res_word_t pars_sum_token; +extern pars_res_word_t pars_distinct_token; +extern pars_res_word_t pars_binary_token; +extern pars_res_word_t pars_blob_token; +extern pars_res_word_t pars_int_token; +extern pars_res_word_t pars_bigint_token; +extern pars_res_word_t pars_char_token; +extern pars_res_word_t pars_float_token; +extern pars_res_word_t pars_update_token; +extern pars_res_word_t pars_asc_token; +extern pars_res_word_t pars_desc_token; +extern pars_res_word_t pars_open_token; +extern pars_res_word_t pars_close_token; +extern pars_res_word_t pars_share_token; +extern pars_res_word_t pars_unique_token; +extern pars_res_word_t pars_clustered_token; + +extern ulint pars_star_denoter; + +/* Procedure parameter types */ +#define PARS_INPUT 0 +#define PARS_OUTPUT 1 +#define PARS_NOT_PARAM 2 + +int +yyparse(void); + +/*************************************************************//** +Parses an SQL string returning the query graph. +@return own: the query graph */ +UNIV_INTERN +que_t* +pars_sql( +/*=====*/ + pars_info_t* info, /*!< in: extra information, or NULL */ + const char* str); /*!< in: SQL string */ +/*************************************************************//** +Retrieves characters to the lexical analyzer. +@return number of characters copied or 0 on EOF */ +UNIV_INTERN +int +pars_get_lex_chars( +/*===============*/ + char* buf, /*!< in/out: buffer where to copy */ + int max_size); /*!< in: maximum number of characters which fit + in the buffer */ +/*************************************************************//** +Called by yyparse on error. */ +UNIV_INTERN +void +yyerror( +/*====*/ + const char* s); /*!< in: error message string */ +/*********************************************************************//** +Parses a variable declaration. +@return own: symbol table node of type SYM_VAR */ +UNIV_INTERN +sym_node_t* +pars_variable_declaration( +/*======================*/ + sym_node_t* node, /*!< in: symbol table node allocated for the + id of the variable */ + pars_res_word_t* type); /*!< in: pointer to a type token */ +/*********************************************************************//** +Parses a function expression. +@return own: function node in a query tree */ +UNIV_INTERN +func_node_t* +pars_func( +/*======*/ + que_node_t* res_word,/*!< in: function name reserved word */ + que_node_t* arg); /*!< in: first argument in the argument list */ +/************************************************************************* +Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded +within the search string. +@return own: function node in a query tree */ +UNIV_INTERN +int +pars_like_rebind( +/*=============*/ + sym_node_t* node, /* in: The search string node.*/ + const byte* ptr, /* in: literal to (re) bind */ + ulint len); /* in: length of literal to (re) bind*/ +/*********************************************************************//** +Parses an operator expression. +@return own: function node in a query tree */ +UNIV_INTERN +func_node_t* +pars_op( +/*====*/ + int func, /*!< in: operator token code */ + que_node_t* arg1, /*!< in: first argument */ + que_node_t* arg2); /*!< in: second argument or NULL for an unary + operator */ +/*********************************************************************//** +Parses an ORDER BY clause. Order by a single column only is supported. +@return own: order-by node in a query tree */ +UNIV_INTERN +order_node_t* +pars_order_by( +/*==========*/ + sym_node_t* column, /*!< in: column name */ + pars_res_word_t* asc); /*!< in: &pars_asc_token or pars_desc_token */ +/*********************************************************************//** +Parses a select list; creates a query graph node for the whole SELECT +statement. +@return own: select node in a query tree */ +UNIV_INTERN +sel_node_t* +pars_select_list( +/*=============*/ + que_node_t* select_list, /*!< in: select list */ + sym_node_t* into_list); /*!< in: variables list or NULL */ +/*********************************************************************//** +Parses a cursor declaration. +@return sym_node */ +UNIV_INTERN +que_node_t* +pars_cursor_declaration( +/*====================*/ + sym_node_t* sym_node, /*!< in: cursor id node in the symbol + table */ + sel_node_t* select_node); /*!< in: select node */ +/*********************************************************************//** +Parses a function declaration. +@return sym_node */ +UNIV_INTERN +que_node_t* +pars_function_declaration( +/*======================*/ + sym_node_t* sym_node); /*!< in: function id node in the symbol + table */ +/*********************************************************************//** +Parses a select statement. +@return own: select node in a query tree */ +UNIV_INTERN +sel_node_t* +pars_select_statement( +/*==================*/ + sel_node_t* select_node, /*!< in: select node already containing + the select list */ + sym_node_t* table_list, /*!< in: table list */ + que_node_t* search_cond, /*!< in: search condition or NULL */ + pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */ + pars_res_word_t* consistent_read,/*!< in: NULL or + &pars_consistent_token */ + order_node_t* order_by); /*!< in: NULL or an order-by node */ +/*********************************************************************//** +Parses a column assignment in an update. +@return column assignment node */ +UNIV_INTERN +col_assign_node_t* +pars_column_assignment( +/*===================*/ + sym_node_t* column, /*!< in: column to assign */ + que_node_t* exp); /*!< in: value to assign */ +/*********************************************************************//** +Parses a delete or update statement start. +@return own: update node in a query tree */ +UNIV_INTERN +upd_node_t* +pars_update_statement_start( +/*========================*/ + ibool is_delete, /*!< in: TRUE if delete */ + sym_node_t* table_sym, /*!< in: table name node */ + col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL + if delete */ +/*********************************************************************//** +Parses an update or delete statement. +@return own: update node in a query tree */ +UNIV_INTERN +upd_node_t* +pars_update_statement( +/*==================*/ + upd_node_t* node, /*!< in: update node */ + sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in + the symbol table or NULL */ + que_node_t* search_cond); /*!< in: search condition or NULL */ +/*********************************************************************//** +Parses an insert statement. +@return own: update node in a query tree */ +UNIV_INTERN +ins_node_t* +pars_insert_statement( +/*==================*/ + sym_node_t* table_sym, /*!< in: table name node */ + que_node_t* values_list, /*!< in: value expression list or NULL */ + sel_node_t* select); /*!< in: select condition or NULL */ +/*********************************************************************//** +Parses a procedure parameter declaration. +@return own: symbol table node of type SYM_VAR */ +UNIV_INTERN +sym_node_t* +pars_parameter_declaration( +/*=======================*/ + sym_node_t* node, /*!< in: symbol table node allocated for the + id of the parameter */ + ulint param_type, + /*!< in: PARS_INPUT or PARS_OUTPUT */ + pars_res_word_t* type); /*!< in: pointer to a type token */ +/*********************************************************************//** +Parses an elsif element. +@return elsif node */ +UNIV_INTERN +elsif_node_t* +pars_elsif_element( +/*===============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses an if-statement. +@return if-statement node */ +UNIV_INTERN +if_node_t* +pars_if_statement( +/*==============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list, /*!< in: statement list */ + que_node_t* else_part); /*!< in: else-part statement list */ +/*********************************************************************//** +Parses a for-loop-statement. +@return for-statement node */ +UNIV_INTERN +for_node_t* +pars_for_statement( +/*===============*/ + sym_node_t* loop_var, /*!< in: loop variable */ + que_node_t* loop_start_limit,/*!< in: loop start expression */ + que_node_t* loop_end_limit, /*!< in: loop end expression */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses a while-statement. +@return while-statement node */ +UNIV_INTERN +while_node_t* +pars_while_statement( +/*=================*/ + que_node_t* cond, /*!< in: while-condition */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses an exit statement. +@return exit statement node */ +UNIV_INTERN +exit_node_t* +pars_exit_statement(void); +/*=====================*/ +/*********************************************************************//** +Parses a return-statement. +@return return-statement node */ +UNIV_INTERN +return_node_t* +pars_return_statement(void); +/*=======================*/ +/*********************************************************************//** +Parses a procedure call. +@return function node */ +UNIV_INTERN +func_node_t* +pars_procedure_call( +/*================*/ + que_node_t* res_word,/*!< in: procedure name reserved word */ + que_node_t* args); /*!< in: argument list */ +/*********************************************************************//** +Parses an assignment statement. +@return assignment statement node */ +UNIV_INTERN +assign_node_t* +pars_assignment_statement( +/*======================*/ + sym_node_t* var, /*!< in: variable to assign */ + que_node_t* val); /*!< in: value to assign */ +/*********************************************************************//** +Parses a fetch statement. into_list or user_func (but not both) must be +non-NULL. +@return fetch statement node */ +UNIV_INTERN +fetch_node_t* +pars_fetch_statement( +/*=================*/ + sym_node_t* cursor, /*!< in: cursor node */ + sym_node_t* into_list, /*!< in: variables to set, or NULL */ + sym_node_t* user_func); /*!< in: user function name, or NULL */ +/*********************************************************************//** +Parses an open or close cursor statement. +@return fetch statement node */ +UNIV_INTERN +open_node_t* +pars_open_statement( +/*================*/ + ulint type, /*!< in: ROW_SEL_OPEN_CURSOR + or ROW_SEL_CLOSE_CURSOR */ + sym_node_t* cursor); /*!< in: cursor node */ +/*********************************************************************//** +Parses a row_printf-statement. +@return row_printf-statement node */ +UNIV_INTERN +row_printf_node_t* +pars_row_printf_statement( +/*======================*/ + sel_node_t* sel_node); /*!< in: select node */ +/*********************************************************************//** +Parses a commit statement. +@return own: commit node struct */ +UNIV_INTERN +commit_node_t* +pars_commit_statement(void); +/*=======================*/ +/*********************************************************************//** +Parses a rollback statement. +@return own: rollback node struct */ +UNIV_INTERN +roll_node_t* +pars_rollback_statement(void); +/*=========================*/ +/*********************************************************************//** +Parses a column definition at a table creation. +@return column sym table node */ +UNIV_INTERN +sym_node_t* +pars_column_def( +/*============*/ + sym_node_t* sym_node, /*!< in: column node in the + symbol table */ + pars_res_word_t* type, /*!< in: data type */ + sym_node_t* len, /*!< in: length of column, or + NULL */ + void* is_unsigned, /*!< in: if not NULL, column + is of type UNSIGNED. */ + void* is_not_null); /*!< in: if not NULL, column + is of type NOT NULL. */ +/*********************************************************************//** +Parses a table creation operation. +@return table create subgraph */ +UNIV_INTERN +tab_node_t* +pars_create_table( +/*==============*/ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_defs, /*!< in: list of column names */ + sym_node_t* compact, /* in: non-NULL if COMPACT table. */ + sym_node_t* block_size, /* in: block size (can be NULL) */ + void* not_fit_in_memory); + /*!< in: a non-NULL pointer means that + this is a table which in simulations + should be simulated as not fitting + in memory; thread is put to sleep + to simulate disk accesses; NOTE that + this flag is not stored to the data + dictionary on disk, and the database + will forget about non-NULL value if + it has to reload the table definition + from disk */ +/*********************************************************************//** +Parses an index creation operation. +@return index create subgraph */ +UNIV_INTERN +ind_node_t* +pars_create_index( +/*==============*/ + pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */ + pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */ + sym_node_t* index_sym, /*!< in: index name node in the symbol + table */ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_list); /*!< in: list of column names */ +/*********************************************************************//** +Parses a procedure definition. +@return query fork node */ +UNIV_INTERN +que_fork_t* +pars_procedure_definition( +/*======================*/ + sym_node_t* sym_node, /*!< in: procedure id node in the symbol + table */ + sym_node_t* param_list, /*!< in: parameter declaration list */ + que_node_t* stat_list); /*!< in: statement list */ + +/*************************************************************//** +Parses a stored procedure call, when this is not within another stored +procedure, that is, the client issues a procedure call directly. +In MySQL/InnoDB, stored InnoDB procedures are invoked via the +parsed procedure tree, not via InnoDB SQL, so this function is not used. +@return query graph */ +UNIV_INTERN +que_fork_t* +pars_stored_procedure_call( +/*=======================*/ + sym_node_t* sym_node); /*!< in: stored procedure name */ +/******************************************************************//** +Completes a query graph by adding query thread and fork nodes +above it and prepares the graph for running. The fork created is of +type QUE_FORK_MYSQL_INTERFACE. +@return query thread node to run */ +UNIV_INTERN +que_thr_t* +pars_complete_graph_for_exec( +/*=========================*/ + que_node_t* node, /*!< in: root node for an incomplete + query graph, or NULL for dummy graph */ + trx_t* trx, /*!< in: transaction handle */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ + __attribute__((nonnull(2,3), warn_unused_result)); + +/****************************************************************//** +Create parser info struct. +@return own: info struct */ +UNIV_INTERN +pars_info_t* +pars_info_create(void); +/*==================*/ + +/****************************************************************//** +Free info struct and everything it contains. */ +UNIV_INTERN +void +pars_info_free( +/*===========*/ + pars_info_t* info); /*!< in, own: info struct */ + +/****************************************************************//** +Add bound literal. */ +UNIV_INTERN +void +pars_info_add_literal( +/*==================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const void* address, /*!< in: address */ + ulint length, /*!< in: length of data */ + ulint type, /*!< in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /*!< in: precise type, e.g. + DATA_UNSIGNED */ + +/****************************************************************//** +Equivalent to pars_info_add_literal(info, name, str, strlen(str), +DATA_VARCHAR, DATA_ENGLISH). */ +UNIV_INTERN +void +pars_info_add_str_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const char* str); /*!< in: string */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +UNIV_INTERN +void +pars_info_bind_literal( +/*===================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const void* address, /* in: address */ + ulint length, /* in: length of data */ + ulint type, /* in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /* in: precise type, e.g. */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +UNIV_INTERN +void +pars_info_bind_varchar_literal( +/*===========================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const byte* str, /*!< in: string */ + ulint str_len); /*!< in: string length */ +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +UNIV_INTERN +void +pars_info_bind_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint32_t* val); /*!< in: value */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +UNIV_INTERN +void +pars_info_bind_int8_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val); /*!< in: value */ +/****************************************************************//** +Add user function. */ +UNIV_INTERN +void +pars_info_bind_function( +/*===================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: function name */ + pars_user_func_cb_t func, /*!< in: function address */ + void* arg); /*!< in: user-supplied argument */ +/****************************************************************//** +Add bound id. */ +UNIV_INTERN +void +pars_info_bind_id( +/*=============*/ + pars_info_t* info, /*!< in: info struct */ + ibool copy_name,/* in: make a copy of name if TRUE */ + const char* name, /*!< in: name */ + const char* id); /*!< in: id */ +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +UNIV_INTERN +void +pars_info_add_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + lint val); /*!< in: value */ + +/****************************************************************//** +Equivalent to: + +char buf[8]; +mach_write_to_8(buf, val); +pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +UNIV_INTERN +void +pars_info_add_ull_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + ib_uint64_t val); /*!< in: value */ + +/****************************************************************//** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +UNIV_INTERN +void +pars_info_bind_ull_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val) /*!< in: value */ + __attribute__((nonnull)); + +/****************************************************************//** +Add bound id. */ +UNIV_INTERN +void +pars_info_add_id( +/*=============*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const char* id); /*!< in: id */ + +/****************************************************************//** +Get bound literal with the given name. +@return bound literal, or NULL if not found */ +UNIV_INTERN +pars_bound_lit_t* +pars_info_get_bound_lit( +/*====================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name); /*!< in: bound literal name to find */ + +/****************************************************************//** +Get bound id with the given name. +@return bound id, or NULL if not found */ +UNIV_INTERN +pars_bound_id_t* +pars_info_get_bound_id( +/*===================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name); /*!< in: bound id name to find */ + +/******************************************************************//** +Release any resources used by the lexer. */ +UNIV_INTERN +void +pars_lexer_close(void); +/*==================*/ + +/** Extra information supplied for pars_sql(). */ +struct pars_info_t { + mem_heap_t* heap; /*!< our own memory heap */ + + ib_vector_t* funcs; /*!< user functions, or NUll + (pars_user_func_t*) */ + ib_vector_t* bound_lits; /*!< bound literals, or NULL + (pars_bound_lit_t*) */ + ib_vector_t* bound_ids; /*!< bound ids, or NULL + (pars_bound_id_t*) */ + + ibool graph_owns_us; /*!< if TRUE (which is the default), + que_graph_free() will free us */ +}; + +/** User-supplied function and argument. */ +struct pars_user_func_t { + const char* name; /*!< function name */ + pars_user_func_cb_t func; /*!< function address */ + void* arg; /*!< user-supplied argument */ +}; + +/** Bound literal. */ +struct pars_bound_lit_t { + const char* name; /*!< name */ + const void* address; /*!< address */ + ulint length; /*!< length of data */ + ulint type; /*!< type, e.g. DATA_FIXBINARY */ + ulint prtype; /*!< precise type, e.g. DATA_UNSIGNED */ + sym_node_t* node; /*!< symbol node */ +}; + +/** Bound identifier. */ +struct pars_bound_id_t { + const char* name; /*!< name */ + const char* id; /*!< identifier */ +}; + +/** Struct used to denote a reserved word in a parsing tree */ +struct pars_res_word_t{ + int code; /*!< the token code for the reserved word from + pars0grm.h */ +}; + +/** A predefined function or operator node in a parsing tree; this construct +is also used for some non-functions like the assignment ':=' */ +struct func_node_t{ + que_common_t common; /*!< type: QUE_NODE_FUNC */ + int func; /*!< token code of the function name */ + ulint fclass; /*!< class of the function */ + que_node_t* args; /*!< argument(s) of the function */ + UT_LIST_NODE_T(func_node_t) cond_list; + /*!< list of comparison conditions; defined + only for comparison operator nodes except, + presently, for OPT_SCROLL_TYPE ones */ + UT_LIST_NODE_T(func_node_t) func_node_list; + /*!< list of function nodes in a parsed + query graph */ +}; + +/** An order-by node in a select */ +struct order_node_t{ + que_common_t common; /*!< type: QUE_NODE_ORDER */ + sym_node_t* column; /*!< order-by column */ + ibool asc; /*!< TRUE if ascending, FALSE if descending */ +}; + +/** Procedure definition node */ +struct proc_node_t{ + que_common_t common; /*!< type: QUE_NODE_PROC */ + sym_node_t* proc_id; /*!< procedure name symbol in the symbol + table of this same procedure */ + sym_node_t* param_list; /*!< input and output parameters */ + que_node_t* stat_list; /*!< statement list */ + sym_tab_t* sym_tab; /*!< symbol table of this procedure */ +}; + +/** elsif-element node */ +struct elsif_node_t{ + que_common_t common; /*!< type: QUE_NODE_ELSIF */ + que_node_t* cond; /*!< if condition */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** if-statement node */ +struct if_node_t{ + que_common_t common; /*!< type: QUE_NODE_IF */ + que_node_t* cond; /*!< if condition */ + que_node_t* stat_list; /*!< statement list */ + que_node_t* else_part; /*!< else-part statement list */ + elsif_node_t* elsif_list; /*!< elsif element list */ +}; + +/** while-statement node */ +struct while_node_t{ + que_common_t common; /*!< type: QUE_NODE_WHILE */ + que_node_t* cond; /*!< while condition */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** for-loop-statement node */ +struct for_node_t{ + que_common_t common; /*!< type: QUE_NODE_FOR */ + sym_node_t* loop_var; /*!< loop variable: this is the + dereferenced symbol from the + variable declarations, not the + symbol occurrence in the for loop + definition */ + que_node_t* loop_start_limit;/*!< initial value of loop variable */ + que_node_t* loop_end_limit; /*!< end value of loop variable */ + lint loop_end_value; /*!< evaluated value for the end value: + it is calculated only when the loop + is entered, and will not change within + the loop */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** exit statement node */ +struct exit_node_t{ + que_common_t common; /*!< type: QUE_NODE_EXIT */ +}; + +/** return-statement node */ +struct return_node_t{ + que_common_t common; /*!< type: QUE_NODE_RETURN */ +}; + +/** Assignment statement node */ +struct assign_node_t{ + que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */ + sym_node_t* var; /*!< variable to set */ + que_node_t* val; /*!< value to assign */ +}; + +/** Column assignment node */ +struct col_assign_node_t{ + que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */ + sym_node_t* col; /*!< column to set */ + que_node_t* val; /*!< value to assign */ +}; + +/** Classes of functions */ +/* @{ */ +#define PARS_FUNC_ARITH 1 /*!< +, -, *, / */ +#define PARS_FUNC_LOGICAL 2 /*!< AND, OR, NOT */ +#define PARS_FUNC_CMP 3 /*!< comparison operators */ +#define PARS_FUNC_PREDEFINED 4 /*!< TO_NUMBER, SUBSTR, ... */ +#define PARS_FUNC_AGGREGATE 5 /*!< COUNT, DISTINCT, SUM */ +#define PARS_FUNC_OTHER 6 /*!< these are not real functions, + e.g., := */ +/* @} */ + +#ifndef UNIV_NONINL +#include "pars0pars.ic" +#endif + +#endif diff --git a/storage/innobase/include/pars0pars.ic b/storage/innobase/include/pars0pars.ic new file mode 100644 index 00000000000..4c88337a265 --- /dev/null +++ b/storage/innobase/include/pars0pars.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0pars.ic +SQL parser + +Created 11/19/1996 Heikki Tuuri +*******************************************************/ diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h new file mode 100644 index 00000000000..bcf73639228 --- /dev/null +++ b/storage/innobase/include/pars0sym.h @@ -0,0 +1,258 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0sym.h +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ + +#ifndef pars0sym_h +#define pars0sym_h + +#include "univ.i" +#include "que0types.h" +#include "usr0types.h" +#include "dict0types.h" +#include "pars0types.h" +#include "row0types.h" + +/******************************************************************//** +Creates a symbol table for a single stored procedure or query. +@return own: symbol table */ +UNIV_INTERN +sym_tab_t* +sym_tab_create( +/*===========*/ + mem_heap_t* heap); /*!< in: memory heap where to create */ +/******************************************************************//** +Frees the memory allocated dynamically AFTER parsing phase for variables +etc. in the symbol table. Does not free the mem heap where the table was +originally created. Frees also SQL explicit cursor definitions. */ +UNIV_INTERN +void +sym_tab_free_private( +/*=================*/ + sym_tab_t* sym_tab); /*!< in, own: symbol table */ +/******************************************************************//** +Adds an integer literal to a symbol table. +@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_int_lit( +/*================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + ulint val); /*!< in: integer value */ +/******************************************************************//** +Adds an string literal to a symbol table. +@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_str_lit( +/*================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const byte* str, /*!< in: string with no quotes around + it */ + ulint len); /*!< in: string length */ +/******************************************************************//** +Add a bound literal to a symbol table. +@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_bound_lit( +/*==================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const char* name, /*!< in: name of bound literal */ + ulint* lit_type); /*!< out: type of literal (PARS_*_LIT) */ +/********************************************************************** +Rebind literal to a node in the symbol table. */ + +sym_node_t* +sym_tab_rebind_lit( +/*===============*/ + /* out: symbol table node */ + sym_node_t* node, /* in: node that is bound to literal*/ + const void* address, /* in: pointer to data */ + ulint length); /* in: length of data */ +/******************************************************************//** +Adds an SQL null literal to a symbol table. +@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_null_lit( +/*=================*/ + sym_tab_t* sym_tab); /*!< in: symbol table */ +/******************************************************************//** +Adds an identifier to a symbol table. +@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_id( +/*===========*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + byte* name, /*!< in: identifier name */ + ulint len); /*!< in: identifier length */ + +/******************************************************************//** +Add a bound identifier to a symbol table. +@return symbol table node */ +UNIV_INTERN +sym_node_t* +sym_tab_add_bound_id( +/*===========*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const char* name); /*!< in: name of bound id */ + +/** Index of sym_node_t::field_nos corresponding to the clustered index */ +#define SYM_CLUST_FIELD_NO 0 +/** Index of sym_node_t::field_nos corresponding to a secondary index */ +#define SYM_SEC_FIELD_NO 1 + +/** Types of a symbol table node */ +enum sym_tab_entry { + SYM_UNSET, /*!< Unset entry. */ + SYM_VAR = 91, /*!< declared parameter or local + variable of a procedure */ + SYM_IMPLICIT_VAR, /*!< storage for a intermediate result + of a calculation */ + SYM_LIT, /*!< literal */ + SYM_TABLE_REF_COUNTED, /*!< database table name, ref counted. Must + be closed explicitly. */ + SYM_TABLE, /*!< database table name */ + SYM_COLUMN, /*!< database table name */ + SYM_CURSOR, /*!< named cursor */ + SYM_PROCEDURE_NAME, /*!< stored procedure name */ + SYM_INDEX, /*!< database index name */ + SYM_FUNCTION /*!< user function name */ +}; + +/** Symbol table node */ +struct sym_node_t{ + que_common_t common; /*!< node type: + QUE_NODE_SYMBOL */ + /* NOTE: if the data field in 'common.val' is not NULL and the symbol + table node is not for a temporary column, the memory for the value has + been allocated from dynamic memory and it should be freed when the + symbol table is discarded */ + + /* 'alias' and 'indirection' are almost the same, but not quite. + 'alias' always points to the primary instance of the variable, while + 'indirection' does the same only if we should use the primary + instance's values for the node's data. This is usually the case, but + when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM + t WHERE id = x;"), we copy the values from the primary instance to + the cursor's instance so that they are fixed for the duration of the + cursor, and set 'indirection' to NULL. If we did not, the value of + 'x' could change between fetches and things would break horribly. + + TODO: It would be cleaner to make 'indirection' a boolean field and + always use 'alias' to refer to the primary node. */ + + sym_node_t* indirection; /*!< pointer to + another symbol table + node which contains + the value for this + node, NULL otherwise */ + sym_node_t* alias; /*!< pointer to + another symbol table + node for which this + node is an alias, + NULL otherwise */ + UT_LIST_NODE_T(sym_node_t) col_var_list; /*!< list of table + columns or a list of + input variables for an + explicit cursor */ + ibool copy_val; /*!< TRUE if a column + and its value should + be copied to dynamic + memory when fetched */ + ulint field_nos[2]; /*!< if a column, in + the position + SYM_CLUST_FIELD_NO is + the field number in the + clustered index; in + the position + SYM_SEC_FIELD_NO + the field number in the + non-clustered index to + use first; if not found + from the index, then + ULINT_UNDEFINED */ + ibool resolved; /*!< TRUE if the + meaning of a variable + or a column has been + resolved; for literals + this is always TRUE */ + enum sym_tab_entry token_type; /*!< type of the + parsed token */ + const char* name; /*!< name of an id */ + ulint name_len; /*!< id name length */ + dict_table_t* table; /*!< table definition + if a table id or a + column id */ + ulint col_no; /*!< column number if a + column */ + sel_buf_t* prefetch_buf; /*!< NULL, or a buffer + for cached column + values for prefetched + rows */ + sel_node_t* cursor_def; /*!< cursor definition + select node if a + named cursor */ + ulint param_type; /*!< PARS_INPUT, + PARS_OUTPUT, or + PARS_NOT_PARAM if not a + procedure parameter */ + sym_tab_t* sym_table; /*!< back pointer to + the symbol table */ + UT_LIST_NODE_T(sym_node_t) sym_list; /*!< list of symbol + nodes */ + sym_node_t* like_node; /* LIKE operator node*/ +}; + +/** Symbol table */ +struct sym_tab_t{ + que_t* query_graph; + /*!< query graph generated by the + parser */ + const char* sql_string; + /*!< SQL string to parse */ + size_t string_len; + /*!< SQL string length */ + int next_char_pos; + /*!< position of the next character in + sql_string to give to the lexical + analyzer */ + pars_info_t* info; /*!< extra information, or NULL */ + sym_node_list_t sym_list; + /*!< list of symbol nodes in the symbol + table */ + UT_LIST_BASE_NODE_T(func_node_t) + func_node_list; + /*!< list of function nodes in the + parsed query graph */ + mem_heap_t* heap; /*!< memory heap from which we can + allocate space */ +}; + +#ifndef UNIV_NONINL +#include "pars0sym.ic" +#endif + +#endif diff --git a/storage/innobase/include/pars0sym.ic b/storage/innobase/include/pars0sym.ic new file mode 100644 index 00000000000..266c1a6310d --- /dev/null +++ b/storage/innobase/include/pars0sym.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0sym.ic +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h new file mode 100644 index 00000000000..47f4b432d20 --- /dev/null +++ b/storage/innobase/include/pars0types.h @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0types.h +SQL parser global types + +Created 1/11/1998 Heikki Tuuri +*******************************************************/ + +#ifndef pars0types_h +#define pars0types_h + +struct pars_info_t; +struct pars_user_func_t; +struct pars_bound_lit_t; +struct pars_bound_id_t; +struct sym_node_t; +struct sym_tab_t; +struct pars_res_word_t; +struct func_node_t; +struct order_node_t; +struct proc_node_t; +struct elsif_node_t; +struct if_node_t; +struct while_node_t; +struct for_node_t; +struct exit_node_t; +struct return_node_t; +struct assign_node_t; +struct col_assign_node_t; + +typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t; + +#endif diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h new file mode 100644 index 00000000000..ba8828623af --- /dev/null +++ b/storage/innobase/include/que0que.h @@ -0,0 +1,530 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0que.h +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0que_h +#define que0que_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "srv0srv.h" +#include "usr0types.h" +#include "que0types.h" +#include "row0types.h" +#include "pars0types.h" + +/* If the following flag is set TRUE, the module will print trace info +of SQL execution in the UNIV_SQL_DEBUG version */ +extern ibool que_trace_on; + +/** Mutex protecting the query threads. */ +extern ib_mutex_t que_thr_mutex; + +/***********************************************************************//** +Creates a query graph fork node. +@return own: fork node */ +UNIV_INTERN +que_fork_t* +que_fork_create( +/*============*/ + que_t* graph, /*!< in: graph, if NULL then this + fork node is assumed to be the + graph root */ + que_node_t* parent, /*!< in: parent node */ + ulint fork_type, /*!< in: fork type */ + mem_heap_t* heap); /*!< in: memory heap where created */ +/***********************************************************************//** +Gets the first thr in a fork. */ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork); /*!< in: query fork */ +/***********************************************************************//** +Gets the child node of the first thr in a fork. */ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork); /*!< in: query fork */ +/***********************************************************************//** +Sets the parent of a graph node. */ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /*!< in: graph node */ + que_node_t* parent);/*!< in: parent */ +/***********************************************************************//** +Creates a query graph thread node. +@return own: query thread node */ +UNIV_INTERN +que_thr_t* +que_thr_create( +/*===========*/ + que_fork_t* parent, /*!< in: parent node, i.e., a fork node */ + mem_heap_t* heap); /*!< in: memory heap where created */ +/**********************************************************************//** +Frees a query graph, but not the heap where it was created. Does not free +explicit cursor declarations, they are freed in que_graph_free. */ +UNIV_INTERN +void +que_graph_free_recursive( +/*=====================*/ + que_node_t* node); /*!< in: query graph node */ +/**********************************************************************//** +Frees a query graph. */ +UNIV_INTERN +void +que_graph_free( +/*===========*/ + que_t* graph); /*!< in: query graph; we assume that the memory + heap where this graph was created is private + to this graph: if not, then use + que_graph_free_recursive and free the heap + afterwards! */ +/**********************************************************************//** +Stops a query thread if graph or trx is in a state requiring it. The +conditions are tested in the order (1) graph, (2) trx. The lock_sys_t::mutex +has to be reserved. +@return TRUE if stopped */ +UNIV_INTERN +ibool +que_thr_stop( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Moves a thread from another state to the QUE_THR_RUNNING state. Increments +the n_active_thrs counters of the query graph and transaction. */ +UNIV_INTERN +void +que_thr_move_to_run_state_for_mysql( +/*================================*/ + que_thr_t* thr, /*!< in: an query thread */ + trx_t* trx); /*!< in: transaction */ +/**********************************************************************//** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL +select, when there is no error or lock wait. */ +UNIV_INTERN +void +que_thr_stop_for_mysql_no_error( +/*============================*/ + que_thr_t* thr, /*!< in: query thread */ + trx_t* trx); /*!< in: transaction */ +/**********************************************************************//** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The +query thread is stopped and made inactive, except in the case where +it was put to the lock wait state in lock0lock.cc, but the lock has already +been granted or the transaction chosen as a victim in deadlock resolution. */ +UNIV_INTERN +void +que_thr_stop_for_mysql( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Run a query thread. Handles lock waits. */ +UNIV_INTERN +void +que_run_threads( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Moves a suspended query thread to the QUE_THR_RUNNING state and release +a worker thread to execute it. This function should be used to end +the wait state of a query thread waiting for a lock or a stored procedure +completion. +@return query thread instance of thread to wakeup or NULL */ +UNIV_INTERN +que_thr_t* +que_thr_end_lock_wait( +/*==================*/ + trx_t* trx); /*!< in: transaction in the + QUE_THR_LOCK_WAIT state */ +/**********************************************************************//** +Starts execution of a command in a query fork. Picks a query thread which +is not in the QUE_THR_RUNNING state and moves it to that state. If none +can be chosen, a situation which may arise in parallelized fetches, NULL +is returned. +@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +UNIV_INTERN +que_thr_t* +que_fork_start_command( +/*===================*/ + que_fork_t* fork); /*!< in: a query fork */ +/***********************************************************************//** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ +/*******************************************************************//** +Determines if this thread is rolling back an incomplete transaction +in crash recovery. +@return TRUE if thr is rolling back an incomplete transaction in crash +recovery */ +UNIV_INLINE +ibool +thr_is_recv( +/*========*/ + const que_thr_t* thr); /*!< in: query thread */ +/***********************************************************************//** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets pointer to the value dfield of a graph node. */ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets the value buffer size of a graph node. +@return val buffer size, not defined if val.data == NULL in node */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Sets the value buffer size of a graph node. */ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /*!< in: graph node */ + ulint size); /*!< in: size */ +/*********************************************************************//** +Gets the next list node in a list of query graph nodes. */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node); /*!< in: node in a list */ +/*********************************************************************//** +Gets the parent node of a query graph node. +@return parent node or NULL */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + que_node_t* node); /*!< in: node */ +/****************************************************************//** +Get the first containing loop node (e.g. while_node_t or for_node_t) for the +given node, or NULL if the node is not within a loop. +@return containing loop node, or NULL. */ +UNIV_INTERN +que_node_t* +que_node_get_containing_loop_node( +/*==============================*/ + que_node_t* node); /*!< in: node */ +/*********************************************************************//** +Catenates a query graph node to a list of them, possible empty list. +@return one-way list of nodes */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + que_node_t* node_list, /*!< in: node list, or NULL */ + que_node_t* node); /*!< in: node */ +/************************************************************************* +Get the last node from the list.*/ +UNIV_INLINE +que_node_t* +que_node_list_get_last( +/*===================*/ + /* out: node last node from list.*/ + que_node_t* node_list); /* in: node list, or NULL */ +/*********************************************************************//** +Gets a query graph node list length. +@return length, for NULL list 0 */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + que_node_t* node_list); /*!< in: node list, or NULL */ +/**********************************************************************//** +Checks if graph, trx, or session is in a state where the query thread should +be stopped. +@return TRUE if should be stopped; NOTE that if the peek is made +without reserving the trx_t::mutex, then another peek with the mutex +reserved is necessary before deciding the actual stopping */ +UNIV_INLINE +ibool +que_thr_peek_stop( +/*==============*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************************//** +Returns TRUE if the query graph is for a SELECT statement. +@return TRUE if a select */ +UNIV_INLINE +ibool +que_graph_is_select( +/*================*/ + que_t* graph); /*!< in: graph */ +/**********************************************************************//** +Prints info of an SQL query graph node. */ +UNIV_INTERN +void +que_node_print_info( +/*================*/ + que_node_t* node); /*!< in: query graph node */ +/*********************************************************************//** +Evaluate the given SQL +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +que_eval_sql( +/*=========*/ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql, /*!< in: SQL string */ + ibool reserve_dict_mutex, + /*!< in: if TRUE, acquire/release + dict_sys->mutex around call to pars_sql. */ + trx_t* trx); /*!< in: trx */ + +/**********************************************************************//** +Round robin scheduler. +@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +UNIV_INTERN +que_thr_t* +que_fork_scheduler_round_robin( +/*===========================*/ + que_fork_t* fork, /*!< in: a query fork */ + que_thr_t* thr); /*!< in: current pos */ + +/*********************************************************************//** +Initialise the query sub-system. */ +UNIV_INTERN +void +que_init(void); +/*==========*/ + +/*********************************************************************//** +Close the query sub-system. */ +UNIV_INTERN +void +que_close(void); +/*===========*/ + +/* Query graph query thread node: the fields are protected by the +trx_t::mutex with the exceptions named below */ + +struct que_thr_t{ + que_common_t common; /*!< type: QUE_NODE_THR */ + ulint magic_n; /*!< magic number to catch memory + corruption */ + que_node_t* child; /*!< graph child node */ + que_t* graph; /*!< graph where this node belongs */ + ulint state; /*!< state of the query thread */ + ibool is_active; /*!< TRUE if the thread has been set + to the run state in + que_thr_move_to_run_state, but not + deactivated in + que_thr_dec_reference_count */ + /*------------------------------*/ + /* The following fields are private to the OS thread executing the + query thread, and are not protected by any mutex: */ + + que_node_t* run_node; /*!< pointer to the node where the + subgraph down from this node is + currently executed */ + que_node_t* prev_node; /*!< pointer to the node from which + the control came */ + ulint resource; /*!< resource usage of the query thread + thus far */ + ulint lock_state; /*!< lock state of thread (table or + row) */ + struct srv_slot_t* + slot; /* The thread slot in the wait + array in srv_sys_t */ + /*------------------------------*/ + /* The following fields are links for the various lists that + this type can be on. */ + UT_LIST_NODE_T(que_thr_t) + thrs; /*!< list of thread nodes of the fork + node */ + UT_LIST_NODE_T(que_thr_t) + trx_thrs; /*!< lists of threads in wait list of + the trx */ + UT_LIST_NODE_T(que_thr_t) + queue; /*!< list of runnable thread nodes in + the server task queue */ + ulint fk_cascade_depth; /*!< maximum cascading call depth + supported for foreign key constraint + related delete/updates */ +}; + +#define QUE_THR_MAGIC_N 8476583 +#define QUE_THR_MAGIC_FREED 123461526 + +/* Query graph fork node: its fields are protected by the query thread mutex */ +struct que_fork_t{ + que_common_t common; /*!< type: QUE_NODE_FORK */ + que_t* graph; /*!< query graph of this node */ + ulint fork_type; /*!< fork type */ + ulint n_active_thrs; /*!< if this is the root of a graph, the + number query threads that have been + started in que_thr_move_to_run_state + but for which que_thr_dec_refer_count + has not yet been called */ + trx_t* trx; /*!< transaction: this is set only in + the root node */ + ulint state; /*!< state of the fork node */ + que_thr_t* caller; /*!< pointer to a possible calling query + thread */ + UT_LIST_BASE_NODE_T(que_thr_t) + thrs; /*!< list of query threads */ + /*------------------------------*/ + /* The fields in this section are defined only in the root node */ + sym_tab_t* sym_tab; /*!< symbol table of the query, + generated by the parser, or NULL + if the graph was created 'by hand' */ + pars_info_t* info; /*!< info struct, or NULL */ + /* The following cur_... fields are relevant only in a select graph */ + + ulint cur_end; /*!< QUE_CUR_NOT_DEFINED, QUE_CUR_START, + QUE_CUR_END */ + ulint cur_pos; /*!< if there are n rows in the result + set, values 0 and n + 1 mean before + first row, or after last row, depending + on cur_end; values 1...n mean a row + index */ + ibool cur_on_row; /*!< TRUE if cursor is on a row, i.e., + it is not before the first row or + after the last row */ + sel_node_t* last_sel_node; /*!< last executed select node, or NULL + if none */ + UT_LIST_NODE_T(que_fork_t) + graphs; /*!< list of query graphs of a session + or a stored procedure */ + /*------------------------------*/ + mem_heap_t* heap; /*!< memory heap where the fork was + created */ + +}; + +/* Query fork (or graph) types */ +#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */ +#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */ +#define QUE_FORK_INSERT 3 +#define QUE_FORK_UPDATE 4 +#define QUE_FORK_ROLLBACK 5 + /* This is really the undo graph used in rollback, + no signal-sending roll_node in this graph */ +#define QUE_FORK_PURGE 6 +#define QUE_FORK_EXECUTE 7 +#define QUE_FORK_PROCEDURE 8 +#define QUE_FORK_PROCEDURE_CALL 9 +#define QUE_FORK_MYSQL_INTERFACE 10 +#define QUE_FORK_RECOVERY 11 + +/* Query fork (or graph) states */ +#define QUE_FORK_ACTIVE 1 +#define QUE_FORK_COMMAND_WAIT 2 +#define QUE_FORK_INVALID 3 +#define QUE_FORK_BEING_FREED 4 + +/* Flag which is ORed to control structure statement node types */ +#define QUE_NODE_CONTROL_STAT 1024 + +/* Query graph node types */ +#define QUE_NODE_LOCK 1 +#define QUE_NODE_INSERT 2 +#define QUE_NODE_UPDATE 4 +#define QUE_NODE_CURSOR 5 +#define QUE_NODE_SELECT 6 +#define QUE_NODE_AGGREGATE 7 +#define QUE_NODE_FORK 8 +#define QUE_NODE_THR 9 +#define QUE_NODE_UNDO 10 +#define QUE_NODE_COMMIT 11 +#define QUE_NODE_ROLLBACK 12 +#define QUE_NODE_PURGE 13 +#define QUE_NODE_CREATE_TABLE 14 +#define QUE_NODE_CREATE_INDEX 15 +#define QUE_NODE_SYMBOL 16 +#define QUE_NODE_RES_WORD 17 +#define QUE_NODE_FUNC 18 +#define QUE_NODE_ORDER 19 +#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_ASSIGNMENT 23 +#define QUE_NODE_FETCH 24 +#define QUE_NODE_OPEN 25 +#define QUE_NODE_COL_ASSIGNMENT 26 +#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_RETURN 28 +#define QUE_NODE_ROW_PRINTF 29 +#define QUE_NODE_ELSIF 30 +#define QUE_NODE_CALL 31 +#define QUE_NODE_EXIT 32 + +/* Query thread states */ +#define QUE_THR_RUNNING 1 +#define QUE_THR_PROCEDURE_WAIT 2 +#define QUE_THR_COMPLETED 3 /* in selects this means that the + thread is at the end of its result set + (or start, in case of a scroll cursor); + in other statements, this means the + thread has done its task */ +#define QUE_THR_COMMAND_WAIT 4 +#define QUE_THR_LOCK_WAIT 5 +#define QUE_THR_SUSPENDED 7 +#define QUE_THR_ERROR 8 + +/* Query thread lock states */ +#define QUE_THR_LOCK_NOLOCK 0 +#define QUE_THR_LOCK_ROW 1 +#define QUE_THR_LOCK_TABLE 2 + +/* From where the cursor position is counted */ +#define QUE_CUR_NOT_DEFINED 1 +#define QUE_CUR_START 2 +#define QUE_CUR_END 3 + +#ifndef UNIV_NONINL +#include "que0que.ic" +#endif + +#endif diff --git a/storage/innobase/include/que0que.ic b/storage/innobase/include/que0que.ic new file mode 100644 index 00000000000..eff5a86d958 --- /dev/null +++ b/storage/innobase/include/que0que.ic @@ -0,0 +1,309 @@ +/***************************************************************************** + +Copyright (c) 1996, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0que.ic +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#include "usr0sess.h" + +/***********************************************************************//** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad(thr); + + return(thr->graph->trx); +} + +/*******************************************************************//** +Determines if this thread is rolling back an incomplete transaction +in crash recovery. +@return TRUE if thr is rolling back an incomplete transaction in crash +recovery */ +UNIV_INLINE +ibool +thr_is_recv( +/*========*/ + const que_thr_t* thr) /*!< in: query thread */ +{ + return(trx_is_recv(thr->graph->trx)); +} + +/***********************************************************************//** +Gets the first thr in a fork. */ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork) /*!< in: query fork */ +{ + return(UT_LIST_GET_FIRST(fork->thrs)); +} + +/***********************************************************************//** +Gets the child node of the first thr in a fork. */ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork) /*!< in: query fork */ +{ + que_thr_t* thr; + + thr = UT_LIST_GET_FIRST(fork->thrs); + + return(thr->child); +} + +/***********************************************************************//** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(((que_common_t*) node)->type); +} + +/***********************************************************************//** +Gets pointer to the value dfield of a graph node. */ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(&(((que_common_t*) node)->val)); +} + +/***********************************************************************//** +Gets the value buffer size of a graph node. +@return val buffer size, not defined if val.data == NULL in node */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(((que_common_t*) node)->val_buf_size); +} + +/***********************************************************************//** +Sets the value buffer size of a graph node. */ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /*!< in: graph node */ + ulint size) /*!< in: size */ +{ + ut_ad(node); + + ((que_common_t*) node)->val_buf_size = size; +} + +/***********************************************************************//** +Sets the parent of a graph node. */ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /*!< in: graph node */ + que_node_t* parent) /*!< in: parent */ +{ + ut_ad(node); + + ((que_common_t*) node)->parent = parent; +} + +/***********************************************************************//** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(dfield_get_type(&((que_common_t*) node)->val)); +} + +/*********************************************************************//** +Catenates a query graph node to a list of them, possible empty list. +@return one-way list of nodes */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + que_node_t* node_list, /*!< in: node list, or NULL */ + que_node_t* node) /*!< in: node */ +{ + que_common_t* cnode; + que_common_t* cnode2; + + cnode = (que_common_t*) node; + + cnode->brother = NULL; + + if (node_list == NULL) { + + return(node); + } + + cnode2 = (que_common_t*) node_list; + + while (cnode2->brother != NULL) { + cnode2 = (que_common_t*) cnode2->brother; + } + + cnode2->brother = node; + + return(node_list); +} + +/************************************************************************* +Removes a query graph node from the list.*/ +UNIV_INLINE +que_node_t* +que_node_list_get_last( +/*===================*/ + /* out: last node in list.*/ + que_node_t* node_list) /* in: node list */ +{ + que_common_t* node; + + ut_a(node_list != NULL); + + node = (que_common_t*) node_list; + + /* We need the last element */ + while (node->brother != NULL) { + node = (que_common_t*) node->brother; + } + + return(node); +} +/*********************************************************************//** +Gets the next list node in a list of query graph nodes. +@return next node in a list of nodes */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node) /*!< in: node in a list */ +{ + return(((que_common_t*) node)->brother); +} + +/*********************************************************************//** +Gets a query graph node list length. +@return length, for NULL list 0 */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + que_node_t* node_list) /*!< in: node list, or NULL */ +{ + const que_common_t* cnode; + ulint len; + + cnode = (const que_common_t*) node_list; + len = 0; + + while (cnode != NULL) { + len++; + cnode = (const que_common_t*) cnode->brother; + } + + return(len); +} + +/*********************************************************************//** +Gets the parent node of a query graph node. +@return parent node or NULL */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + que_node_t* node) /*!< in: node */ +{ + return(((que_common_t*) node)->parent); +} + +/**********************************************************************//** +Checks if graph, trx, or session is in a state where the query thread should +be stopped. +@return TRUE if should be stopped; NOTE that if the peek is made +without reserving the trx mutex, then another peek with the mutex +reserved is necessary before deciding the actual stopping */ +UNIV_INLINE +ibool +que_thr_peek_stop( +/*==============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t* trx; + que_t* graph; + + graph = thr->graph; + trx = graph->trx; + + if (graph->state != QUE_FORK_ACTIVE + || trx->lock.que_state == TRX_QUE_LOCK_WAIT + || (trx->lock.que_state != TRX_QUE_ROLLING_BACK + && trx->lock.que_state != TRX_QUE_RUNNING)) { + + return(TRUE); + } + + return(FALSE); +} + +/***********************************************************************//** +Returns TRUE if the query graph is for a SELECT statement. +@return TRUE if a select */ +UNIV_INLINE +ibool +que_graph_is_select( +/*================*/ + que_t* graph) /*!< in: graph */ +{ + if (graph->fork_type == QUE_FORK_SELECT_SCROLL + || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) { + + return(TRUE); + } + + return(FALSE); +} diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h new file mode 100644 index 00000000000..0f11cad301a --- /dev/null +++ b/storage/innobase/include/que0types.h @@ -0,0 +1,57 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0types.h +Query graph global types + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0types_h +#define que0types_h + +#include "data0data.h" +#include "dict0types.h" + +/* Pseudotype for all graph nodes */ +typedef void que_node_t; + +/* Query graph root is a fork node */ +typedef struct que_fork_t que_t; + +struct que_thr_t; + +/* Common struct at the beginning of each query graph node; the name of this +substruct must be 'common' */ + +struct que_common_t{ + ulint type; /*!< query node type */ + que_node_t* parent; /*!< back pointer to parent node, or NULL */ + que_node_t* brother;/* pointer to a possible brother node */ + dfield_t val; /*!< evaluated value for an expression */ + ulint val_buf_size; + /* buffer size for the evaluated value data, + if the buffer has been allocated dynamically: + if this field is != 0, and the node is a + symbol node or a function node, then we + have to free the data field in val + explicitly */ +}; + +#endif diff --git a/storage/innobase/include/read0read.h b/storage/innobase/include/read0read.h new file mode 100644 index 00000000000..980faddf98e --- /dev/null +++ b/storage/innobase/include/read0read.h @@ -0,0 +1,193 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/read0read.h +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#ifndef read0read_h +#define read0read_h + +#include "univ.i" + + +#include "ut0byte.h" +#include "ut0lst.h" +#include "trx0trx.h" +#include "read0types.h" + +/*********************************************************************//** +Opens a read view where exactly the transactions serialized before this +point in time are seen in the view. +@return own: read view struct */ +UNIV_INTERN +read_view_t* +read_view_open_now( +/*===============*/ + trx_id_t cr_trx_id, /*!< in: trx_id of creating + transaction, or 0 used in purge */ + mem_heap_t* heap); /*!< in: memory heap from which + allocated */ +/*********************************************************************//** +Makes a copy of the oldest existing read view, or opens a new. The view +must be closed with ..._close. +@return own: read view struct */ +UNIV_INTERN +read_view_t* +read_view_purge_open( +/*=================*/ + mem_heap_t* heap); /*!< in: memory heap from which + allocated */ +/*********************************************************************//** +Remove a read view from the trx_sys->view_list. */ +UNIV_INLINE +void +read_view_remove( +/*=============*/ + read_view_t* view, /*!< in: read view, can be 0 */ + bool own_mutex); /*!< in: true if caller owns the + trx_sys_t::mutex */ +/*********************************************************************//** +Closes a consistent read view for MySQL. This function is called at an SQL +statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ +UNIV_INTERN +void +read_view_close_for_mysql( +/*======================*/ + trx_t* trx); /*!< in: trx which has a read view */ +/*********************************************************************//** +Checks if a read view sees the specified transaction. +@return true if sees */ +UNIV_INLINE +bool +read_view_sees_trx_id( +/*==================*/ + const read_view_t* view, /*!< in: read view */ + trx_id_t trx_id) /*!< in: trx id */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Prints a read view to stderr. */ +UNIV_INTERN +void +read_view_print( +/*============*/ + const read_view_t* view); /*!< in: read view */ +/*********************************************************************//** +Create a consistent cursor view for mysql to be used in cursors. In this +consistent read view modifications done by the creating transaction or future +transactions are not visible. */ +UNIV_INTERN +cursor_view_t* +read_cursor_view_create_for_mysql( +/*==============================*/ + trx_t* cr_trx);/*!< in: trx where cursor view is created */ +/*********************************************************************//** +Close a given consistent cursor view for mysql and restore global read view +back to a transaction read view. */ +UNIV_INTERN +void +read_cursor_view_close_for_mysql( +/*=============================*/ + trx_t* trx, /*!< in: trx */ + cursor_view_t* curview); /*!< in: cursor view to be closed */ +/*********************************************************************//** +This function sets a given consistent cursor view to a transaction +read view if given consistent cursor view is not NULL. Otherwise, function +restores a global read view to a transaction read view. */ +UNIV_INTERN +void +read_cursor_set_for_mysql( +/*======================*/ + trx_t* trx, /*!< in: transaction where cursor is set */ + cursor_view_t* curview);/*!< in: consistent cursor view to be set */ + +/** Read view lists the trx ids of those transactions for which a consistent +read should not see the modifications to the database. */ + +struct read_view_t{ + ulint type; /*!< VIEW_NORMAL, VIEW_HIGH_GRANULARITY */ + undo_no_t undo_no;/*!< 0 or if type is + VIEW_HIGH_GRANULARITY + transaction undo_no when this high-granularity + consistent read view was created */ + trx_id_t low_limit_no; + /*!< The view does not need to see the undo + logs for transactions whose transaction number + is strictly smaller (<) than this value: they + can be removed in purge if not needed by other + views */ + trx_id_t low_limit_id; + /*!< The read should not see any transaction + with trx id >= this value. In other words, + this is the "high water mark". */ + trx_id_t up_limit_id; + /*!< The read should see all trx ids which + are strictly smaller (<) than this value. + In other words, + this is the "low water mark". */ + ulint n_trx_ids; + /*!< Number of cells in the trx_ids array */ + trx_id_t* trx_ids;/*!< Additional trx ids which the read should + not see: typically, these are the read-write + active transactions at the time when the read + is serialized, except the reading transaction + itself; the trx ids in this array are in a + descending order. These trx_ids should be + between the "low" and "high" water marks, + that is, up_limit_id and low_limit_id. */ + trx_id_t creator_trx_id; + /*!< trx id of creating transaction, or + 0 used in purge */ + UT_LIST_NODE_T(read_view_t) view_list; + /*!< List of read views in trx_sys */ +}; + +/** Read view types @{ */ +#define VIEW_NORMAL 1 /*!< Normal consistent read view + where transaction does not see changes + made by active transactions except + creating transaction. */ +#define VIEW_HIGH_GRANULARITY 2 /*!< High-granularity read view where + transaction does not see changes + made by active transactions and own + changes after a point in time when this + read view was created. */ +/* @} */ + +/** Implement InnoDB framework to support consistent read views in +cursors. This struct holds both heap where consistent read view +is allocated and pointer to a read view. */ + +struct cursor_view_t{ + mem_heap_t* heap; + /*!< Memory heap for the cursor view */ + read_view_t* read_view; + /*!< Consistent read view of the cursor*/ + ulint n_mysql_tables_in_use; + /*!< number of Innobase tables used in the + processing of this cursor */ +}; + +#ifndef UNIV_NONINL +#include "read0read.ic" +#endif + +#endif diff --git a/storage/innobase/include/read0read.ic b/storage/innobase/include/read0read.ic new file mode 100644 index 00000000000..82c1028f12e --- /dev/null +++ b/storage/innobase/include/read0read.ic @@ -0,0 +1,148 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/read0read.ic +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#include "trx0sys.h" + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Validates a read view object. */ +static +bool +read_view_validate( +/*===============*/ + const read_view_t* view) /*!< in: view to validate */ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + /* Check that the view->trx_ids array is in descending order. */ + for (ulint i = 1; i < view->n_trx_ids; ++i) { + + ut_a(view->trx_ids[i] < view->trx_ids[i - 1]); + } + + return(true); +} + +/** Functor to validate the view list. */ +struct ViewCheck { + + ViewCheck() : m_prev_view(0) { } + + void operator()(const read_view_t* view) + { + ut_a(m_prev_view == NULL + || m_prev_view->low_limit_no >= view->low_limit_no); + + m_prev_view = view; + } + + const read_view_t* m_prev_view; +}; + +/*********************************************************************//** +Validates a read view list. */ +static +bool +read_view_list_validate(void) +/*=========================*/ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + ut_list_map(trx_sys->view_list, &read_view_t::view_list, ViewCheck()); + + return(true); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Checks if a read view sees the specified transaction. +@return true if sees */ +UNIV_INLINE +bool +read_view_sees_trx_id( +/*==================*/ + const read_view_t* view, /*!< in: read view */ + trx_id_t trx_id) /*!< in: trx id */ +{ + if (trx_id < view->up_limit_id) { + + return(true); + } else if (trx_id >= view->low_limit_id) { + + return(false); + } else { + ulint lower = 0; + ulint upper = view->n_trx_ids - 1; + + ut_a(view->n_trx_ids > 0); + + do { + ulint mid = (lower + upper) >> 1; + trx_id_t mid_id = view->trx_ids[mid]; + + if (mid_id == trx_id) { + return(FALSE); + } else if (mid_id < trx_id) { + if (mid > 0) { + upper = mid - 1; + } else { + break; + } + } else { + lower = mid + 1; + } + } while (lower <= upper); + } + + return(true); +} + +/*********************************************************************//** +Remove a read view from the trx_sys->view_list. */ +UNIV_INLINE +void +read_view_remove( +/*=============*/ + read_view_t* view, /*!< in: read view, can be 0 */ + bool own_mutex) /*!< in: true if caller owns the + trx_sys_t::mutex */ +{ + if (view != 0) { + if (!own_mutex) { + mutex_enter(&trx_sys->mutex); + } + + ut_ad(read_view_validate(view)); + + UT_LIST_REMOVE(view_list, trx_sys->view_list, view); + + ut_ad(read_view_list_validate()); + + if (!own_mutex) { + mutex_exit(&trx_sys->mutex); + } + } +} + diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h new file mode 100644 index 00000000000..969f4ebb637 --- /dev/null +++ b/storage/innobase/include/read0types.h @@ -0,0 +1,32 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/read0types.h +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#ifndef read0types_h +#define read0types_h + +struct read_view_t; +struct cursor_view_t; + +#endif diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h new file mode 100644 index 00000000000..cb3c85ac2c8 --- /dev/null +++ b/storage/innobase/include/rem0cmp.h @@ -0,0 +1,301 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/rem0cmp.h +Comparison services for records + +Created 7/1/1994 Heikki Tuuri +************************************************************************/ + +#ifndef rem0cmp_h +#define rem0cmp_h + +#include "univ.i" +#include "data0data.h" +#include "data0type.h" +#include "dict0dict.h" +#include "rem0rec.h" + +/*************************************************************//** +Returns TRUE if two columns are equal for comparison purposes. +@return TRUE if the columns are considered equal in comparisons */ +UNIV_INTERN +ibool +cmp_cols_are_equal( +/*===============*/ + const dict_col_t* col1, /*!< in: column 1 */ + const dict_col_t* col2, /*!< in: column 2 */ + ibool check_charsets); + /*!< in: whether to check charsets */ +/*************************************************************//** +This function is used to compare two data fields for which we know the +data type. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INLINE +int +cmp_data_data( +/*==========*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + const byte* data1, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len1, /*!< in: data field length or UNIV_SQL_NULL */ + const byte* data2, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len2); /*!< in: data field length or UNIV_SQL_NULL */ +/*************************************************************//** +This function is used to compare two data fields for which we know the +data type. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INTERN +int +cmp_data_data_slow( +/*===============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + const byte* data1, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len1, /*!< in: data field length or UNIV_SQL_NULL */ + const byte* data2, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len2); /*!< in: data field length or UNIV_SQL_NULL */ + +/***************************************************************** +This function is used to compare two data fields for which we know the +data type to be VARCHAR. +@return 1, 0, -1, if lhs is greater, equal, less than rhs, respectively */ +UNIV_INTERN +int +cmp_data_data_slow_varchar( +/*=======================*/ + const byte* lhs, /* in: data field (== a pointer to a memory + buffer) */ + ulint lhs_len,/* in: data field length or UNIV_SQL_NULL */ + const byte* rhs, /* in: data field (== a pointer to a memory + buffer) */ + ulint rhs_len);/* in: data field length or UNIV_SQL_NULL */ +/***************************************************************** +This function is used to compare two varchar/char fields. The comparison +is for the LIKE operator. +@return 1, 0, -1, if lhs is greater, equal, less than rhs, respectively */ +UNIV_INTERN +int +cmp_data_data_slow_like_prefix( +/*===========================*/ + const byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + const byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2); /* in: data field length or UNIV_SQL_NULL */ +/***************************************************************** +This function is used to compare two varchar/char fields. The comparison +is for the LIKE operator. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INTERN +int +cmp_data_data_slow_like_suffix( +/*===========================*/ + const byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + const byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2); /* in: data field length or UNIV_SQL_NULL */ +/***************************************************************** +This function is used to compare two varchar/char fields. The comparison +is for the LIKE operator. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INTERN +int +cmp_data_data_slow_like_substr( +/*===========================*/ + const byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + const byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2); /* in: data field length or UNIV_SQL_NULL */ +/*************************************************************//** +This function is used to compare two dfields where at least the first +has its data type field set. +@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2, +respectively */ +UNIV_INLINE +int +cmp_dfield_dfield( +/*==============*/ + const dfield_t* dfield1,/*!< in: data field; must have type field set */ + const dfield_t* dfield2);/*!< in: data field */ +/*************************************************************//** +This function is used to compare a data tuple to a physical record. +Only dtuple->n_fields_cmp first fields are taken into account for +the data tuple! If we denote by n = n_fields_cmp, then rec must +have either m >= n fields, or it must differ from dtuple in some of +the m fields rec has. If rec has an externally stored field we do not +compare it but return with value 0 if such a comparison should be +made. +@return 1, 0, -1, if dtuple is greater, equal, less than rec, +respectively, when only the common first fields are compared, or until +the first externally stored field in rec */ +UNIV_INTERN +int +cmp_dtuple_rec_with_match_low( +/*==========================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record which differs from + dtuple in some of the common fields, or which + has an equal number or more fields than + dtuple */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n_cmp, /*!< in: number of fields to compare */ + ulint* matched_fields, + /*!< in/out: number of already completely + matched fields; when function returns, + contains the value for current comparison */ + ulint* matched_bytes) + /*!< in/out: number of already matched + bytes within the first field not completely + matched; when function returns, contains the + value for current comparison */ + __attribute__((nonnull)); +#define cmp_dtuple_rec_with_match(tuple,rec,offsets,fields,bytes) \ + cmp_dtuple_rec_with_match_low( \ + tuple,rec,offsets,dtuple_get_n_fields_cmp(tuple),fields,bytes) +/**************************************************************//** +Compares a data tuple to a physical record. +@see cmp_dtuple_rec_with_match +@return 1, 0, -1, if dtuple is greater, equal, less than rec, respectively */ +UNIV_INTERN +int +cmp_dtuple_rec( +/*===========*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +/**************************************************************//** +Checks if a dtuple is a prefix of a record. The last field in dtuple +is allowed to be a prefix of the corresponding field in the record. +@return TRUE if prefix */ +UNIV_INTERN +ibool +cmp_dtuple_is_prefix_of_rec( +/*========================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +/*************************************************************//** +Compare two physical records that contain the same number of columns, +none of which are stored externally. +@retval 1 if rec1 (including non-ordering columns) is greater than rec2 +@retval -1 if rec1 (including non-ordering columns) is less than rec2 +@retval 0 if rec1 is a duplicate of rec2 */ +UNIV_INTERN +int +cmp_rec_rec_simple( +/*===============*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */ + const dict_index_t* index, /*!< in: data dictionary index */ + struct TABLE* table) /*!< in: MySQL table, for reporting + duplicate key value if applicable, + or NULL */ + __attribute__((nonnull(1,2,3,4), warn_unused_result)); +/*************************************************************//** +This function is used to compare two physical records. Only the common +first fields are compared, and if an externally stored field is +encountered, then 0 is returned. +@return 1, 0, -1 if rec1 is greater, equal, less, respectively */ +UNIV_INTERN +int +cmp_rec_rec_with_match( +/*===================*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ + dict_index_t* index, /*!< in: data dictionary index */ + ibool nulls_unequal, + /* in: TRUE if this is for index statistics + cardinality estimation, and innodb_stats_method + is "nulls_unequal" or "nulls_ignored" */ + ulint* matched_fields, /*!< in/out: number of already completely + matched fields; when the function returns, + contains the value the for current + comparison */ + ulint* matched_bytes);/*!< in/out: number of already matched + bytes within the first field not completely + matched; when the function returns, contains + the value for the current comparison */ +/*************************************************************//** +This function is used to compare two physical records. Only the common +first fields are compared. +@return 1, 0 , -1 if rec1 is greater, equal, less, respectively, than +rec2; only the common first fields are compared */ +UNIV_INLINE +int +cmp_rec_rec( +/*========*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ + dict_index_t* index); /*!< in: data dictionary index */ + +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INTERN +int +cmp_dfield_dfield_like_prefix( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2);/* in: data field */ +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INLINE +int +cmp_dfield_dfield_like_substr( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2);/* in: data field */ +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INLINE +int +cmp_dfield_dfield_like_suffix( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2);/* in: data field */ + +#ifndef UNIV_NONINL +#include "rem0cmp.ic" +#endif + +#endif diff --git a/storage/innobase/include/rem0cmp.ic b/storage/innobase/include/rem0cmp.ic new file mode 100644 index 00000000000..67a2dcacba1 --- /dev/null +++ b/storage/innobase/include/rem0cmp.ic @@ -0,0 +1,186 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/rem0cmp.ic +Comparison services for records + +Created 7/1/1994 Heikki Tuuri +************************************************************************/ + +/*************************************************************//** +This function is used to compare two data fields for which we know the +data type. +@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */ +UNIV_INLINE +int +cmp_data_data( +/*==========*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type */ + const byte* data1, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len1, /*!< in: data field length or UNIV_SQL_NULL */ + const byte* data2, /*!< in: data field (== a pointer to a memory + buffer) */ + ulint len2) /*!< in: data field length or UNIV_SQL_NULL */ +{ + return(cmp_data_data_slow(mtype, prtype, data1, len1, data2, len2)); +} + +/***************************************************************** +This function is used to compare two (CHAR) data fields for the LIKE +operator. */ +UNIV_INLINE +int +cmp_data_data_like_prefix( +/*======================*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2) /* in: data field length or UNIV_SQL_NULL */ +{ + return(cmp_data_data_slow_like_prefix(data1, len1, data2, len2)); +} +/***************************************************************** +This function is used to compare two (CHAR) data fields for the LIKE +operator. */ +UNIV_INLINE +int +cmp_data_data_like_suffix( +/*======================*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2) /* in: data field length or UNIV_SQL_NULL */ +{ + return(cmp_data_data_slow_like_suffix(data1, len1, data2, len2)); +} +/***************************************************************** +This function is used to compare two (CHAR) data fields for the LIKE +operator. */ +UNIV_INLINE +int +cmp_data_data_like_substr( +/*======================*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2) /* in: data field length or UNIV_SQL_NULL */ +{ + return(cmp_data_data_slow_like_substr(data1, len1, data2, len2)); +} +/*************************************************************//** +This function is used to compare two dfields where at least the first +has its data type field set. +@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2, +respectively */ +UNIV_INLINE +int +cmp_dfield_dfield( +/*==============*/ + const dfield_t* dfield1,/*!< in: data field; must have type field set */ + const dfield_t* dfield2)/*!< in: data field */ +{ + const dtype_t* type; + + ut_ad(dfield_check_typed(dfield1)); + + type = dfield_get_type(dfield1); + + return(cmp_data_data(type->mtype, type->prtype, + (const byte*) dfield_get_data(dfield1), + dfield_get_len(dfield1), + (const byte*) dfield_get_data(dfield2), + dfield_get_len(dfield2))); +} + +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INLINE +int +cmp_dfield_dfield_like_suffix( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2)/* in: data field */ +{ + ut_ad(dfield_check_typed(dfield1)); + + return(cmp_data_data_like_suffix( + (byte*) dfield_get_data(dfield1), + dfield_get_len(dfield1), + (byte*) dfield_get_data(dfield2), + dfield_get_len(dfield2))); +} + +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INLINE +int +cmp_dfield_dfield_like_substr( +/*==========================*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + dfield_t* dfield1,/* in: data field; must have type field set */ + dfield_t* dfield2)/* in: data field */ +{ + ut_ad(dfield_check_typed(dfield1)); + + return(cmp_data_data_like_substr( + (byte*) dfield_get_data(dfield1), + dfield_get_len(dfield1), + (byte*) dfield_get_data(dfield2), + dfield_get_len(dfield2))); +} +/*************************************************************//** +This function is used to compare two physical records. Only the common +first fields are compared. +@return 1, 0 , -1 if rec1 is greater, equal, less, respectively, than +rec2; only the common first fields are compared */ +UNIV_INLINE +int +cmp_rec_rec( +/*========*/ + const rec_t* rec1, /*!< in: physical record */ + const rec_t* rec2, /*!< in: physical record */ + const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */ + dict_index_t* index) /*!< in: data dictionary index */ +{ + ulint match_f = 0; + ulint match_b = 0; + + return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, + FALSE, &match_f, &match_b)); +} diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h new file mode 100644 index 00000000000..8e7d5ff2d48 --- /dev/null +++ b/storage/innobase/include/rem0rec.h @@ -0,0 +1,988 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0rec.h +Record manager + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef rem0rec_h +#define rem0rec_h + +#include "univ.i" +#include "data0data.h" +#include "rem0types.h" +#include "mtr0types.h" +#include "page0types.h" + +/* Info bit denoting the predefined minimum record: this bit is set +if and only if the record is the first user record on a non-leaf +B-tree page that is the leftmost page on its level +(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */ +#define REC_INFO_MIN_REC_FLAG 0x10UL +/* The deleted flag in info bits */ +#define REC_INFO_DELETED_FLAG 0x20UL /* when bit is set to 1, it means the + record has been delete marked */ + +/* Number of extra bytes in an old-style record, +in addition to the data and the offsets */ +#define REC_N_OLD_EXTRA_BYTES 6 +/* Number of extra bytes in a new-style record, +in addition to the data and the offsets */ +#define REC_N_NEW_EXTRA_BYTES 5 + +/* Record status values */ +#define REC_STATUS_ORDINARY 0 +#define REC_STATUS_NODE_PTR 1 +#define REC_STATUS_INFIMUM 2 +#define REC_STATUS_SUPREMUM 3 + +/* The following four constants are needed in page0zip.cc in order to +efficiently compress and decompress pages. */ + +/* The offset of heap_no in a compact record */ +#define REC_NEW_HEAP_NO 4 +/* The shift of heap_no in a compact record. +The status is stored in the low-order bits. */ +#define REC_HEAP_NO_SHIFT 3 + +/* Length of a B-tree node pointer, in bytes */ +#define REC_NODE_PTR_SIZE 4 + +/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */ +#define REC_1BYTE_SQL_NULL_MASK 0x80UL +/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */ +#define REC_2BYTE_SQL_NULL_MASK 0x8000UL + +/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most +significant bit denotes that the tail of a field is stored off-page. */ +#define REC_2BYTE_EXTERN_MASK 0x4000UL + +#ifdef UNIV_DEBUG +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 4 +#else /* UNIV_DEBUG */ +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 2 +#endif /* UNIV_DEBUG */ + +/* Number of elements that should be initially allocated for the +offsets[] array, first passed to rec_get_offsets() */ +#define REC_OFFS_NORMAL_SIZE 100 +#define REC_OFFS_SMALL_SIZE 10 + +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. +@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +const rec_t* +rec_get_next_ptr_const( +/*===================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. +@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +rec_t* +rec_get_next_ptr( +/*=============*/ + rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to get the offset of the +next chained record on the same page. +@return the page offset of the next chained record, or 0 if none */ +UNIV_INLINE +ulint +rec_get_next_offs( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the next record offset field +of an old-style record. */ +UNIV_INLINE +void +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint next) /*!< in: offset of the next record */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to set the next record offset field +of a new-style record. */ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint next) /*!< in: offset of the next record */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to get the number of fields +in an old-style record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields_old( +/*=================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to get the number of fields +in a record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_old( +/*================*/ + const rec_t* rec) /*!< in: old-style physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_old( +/*================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint n_owned) /*!< in: the number of owned */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_new( +/*================*/ + const rec_t* rec) /*!< in: new-style physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_new( +/*================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint n_owned)/*!< in: the number of owned */ + __attribute__((nonnull(1))); +/******************************************************//** +The following function is used to retrieve the info bits of +a record. +@return info bits */ +UNIV_INLINE +ulint +rec_get_info_bits( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); +/******************************************************//** +The following function retrieves the status bits of a new-style record. +@return status bits */ +UNIV_INLINE +ulint +rec_get_status( +/*===========*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +The following function is used to set the status bits of a new-style record. */ +UNIV_INLINE +void +rec_set_status( +/*===========*/ + rec_t* rec, /*!< in/out: physical record */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); + +/******************************************************//** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) +@return info bits */ +UNIV_INLINE +ulint +rec_get_info_and_status_bits( +/*=========================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /*!< in/out: compact physical record */ + ulint bits) /*!< in: info bits */ + __attribute__((nonnull)); + +/******************************************************//** +The following function tells if record is delete marked. +@return nonzero if delete marked */ +UNIV_INLINE +ulint +rec_get_deleted_flag( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_old( +/*=====================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint flag) /*!< in: nonzero if delete marked */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_new( +/*=====================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint flag) /*!< in: nonzero if delete marked */ + __attribute__((nonnull(1))); +/******************************************************//** +The following function tells if a new-style record is a node pointer. +@return TRUE if node pointer */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to get the order number +of an old-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the heap number +field in an old-style record. */ +UNIV_INLINE +void +rec_set_heap_no_old( +/*================*/ + rec_t* rec, /*!< in: physical record */ + ulint heap_no)/*!< in: the heap number */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to get the order number +of a new-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +The following function is used to set the heap number +field in a new-style record. */ +UNIV_INLINE +void +rec_set_heap_no_new( +/*================*/ + rec_t* rec, /*!< in/out: physical record */ + ulint heap_no)/*!< in: the heap number */ + __attribute__((nonnull)); +/******************************************************//** +The following function is used to test whether the data offsets +in the record are stored in one-byte or two-byte format. +@return TRUE if 1-byte form */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ + __attribute__((nonnull)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +ulint +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +Returns nonzero if the field is stored off-page. +@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +Determine how many of the first n columns in a compact +physical record are stored externally. +@return number of externally stored columns */ +UNIV_INTERN +ulint +rec_get_n_extern_new( +/*=================*/ + const rec_t* rec, /*!< in: compact physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n) /*!< in: number of columns to scan */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************//** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. +@return the new offsets */ +UNIV_INTERN +ulint* +rec_get_offsets_func( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets,/*!< in/out: array consisting of + offsets[0] allocated elements, + or an array from rec_get_offsets(), + or NULL */ + ulint n_fields,/*!< in: maximum number of + initialized fields + (ULINT_UNDEFINED if all fields) */ +#ifdef UNIV_DEBUG + const char* file, /*!< in: file name where called */ + ulint line, /*!< in: line number where called */ +#endif /* UNIV_DEBUG */ + mem_heap_t** heap) /*!< in/out: memory heap */ +#ifdef UNIV_DEBUG + __attribute__((nonnull(1,2,5,7),warn_unused_result)); +#else /* UNIV_DEBUG */ + __attribute__((nonnull(1,2,5),warn_unused_result)); +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +# define rec_get_offsets(rec,index,offsets,n,heap) \ + rec_get_offsets_func(rec,index,offsets,n,__FILE__,__LINE__,heap) +#else /* UNIV_DEBUG */ +# define rec_get_offsets(rec, index, offsets, n, heap) \ + rec_get_offsets_func(rec, index, offsets, n, heap) +#endif /* UNIV_DEBUG */ + +/******************************************************//** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. */ +UNIV_INTERN +void +rec_get_offsets_reverse( +/*====================*/ + const byte* extra, /*!< in: the extra bytes of a + compact record in reverse order, + excluding the fixed-size + REC_N_NEW_EXTRA_BYTES */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint node_ptr,/*!< in: nonzero=node pointer, + 0=leaf node */ + ulint* offsets)/*!< in/out: array consisting of + offsets[0] allocated elements */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +/************************************************************//** +Validates offsets returned by rec_get_offsets(). +@return TRUE if valid */ +UNIV_INLINE +ibool +rec_offs_validate( +/*==============*/ + const rec_t* rec, /*!< in: record or NULL */ + const dict_index_t* index, /*!< in: record descriptor or NULL */ + const ulint* offsets)/*!< in: array returned by + rec_get_offsets() */ + __attribute__((nonnull(3), warn_unused_result)); +/************************************************************//** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. */ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in: array returned by + rec_get_offsets() */ + __attribute__((nonnull)); +#else +# define rec_offs_make_valid(rec, index, offsets) ((void) 0) +#endif /* UNIV_DEBUG */ + +/************************************************************//** +The following function is used to get the offset to the nth +data field in an old-style record. +@return offset to the field */ +UNIV_INTERN +ulint +rec_get_nth_field_offs_old( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ + __attribute__((nonnull)); +#define rec_get_nth_field_old(rec, n, len) \ +((rec) + rec_get_nth_field_offs_old(rec, n, len)) +/************************************************************//** +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. +@return field size in bytes */ +UNIV_INLINE +ulint +rec_get_nth_field_size( +/*===================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: index of the field */ + __attribute__((nonnull, pure, warn_unused_result)); +/************************************************************//** +The following function is used to get an offset to the nth +data field in a record. +@return offset from the origin of rec */ +UNIV_INLINE +ulint +rec_get_nth_field_offs( +/*===================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ + __attribute__((nonnull)); +#define rec_get_nth_field(rec, offsets, n, len) \ +((rec) + rec_get_nth_field_offs(offsets, n, len)) +/******************************************************//** +Determine if the offsets are for a record in the new +compact format. +@return nonzero if compact format */ +UNIV_INLINE +ulint +rec_offs_comp( +/*==========*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +Determine if the offsets are for a record containing +externally stored columns. +@return nonzero if externally stored */ +UNIV_INLINE +ulint +rec_offs_any_extern( +/*================*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +Determine if the offsets are for a record containing null BLOB pointers. +@return first field containing a null BLOB pointer, or NULL if none found */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + const ulint* offsets) /*!< in: rec_get_offsets(rec) */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +Returns nonzero if the extern bit is set in nth field of rec. +@return nonzero if externally stored */ +UNIV_INLINE +ulint +rec_offs_nth_extern( +/*================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +Returns nonzero if the SQL NULL bit is set in nth field of rec. +@return nonzero if SQL NULL */ +UNIV_INLINE +ulint +rec_offs_nth_sql_null( +/*==================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ + __attribute__((nonnull, pure, warn_unused_result)); +/******************************************************//** +Gets the physical size of a field. +@return length of field */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ + __attribute__((nonnull, pure, warn_unused_result)); + +/******************************************************//** +Returns the number of extern bits set in a record. +@return number of externally stored fields */ +UNIV_INLINE +ulint +rec_offs_n_extern( +/*==============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/***********************************************************//** +This is used to modify the value of an already existing field in a record. +The previous value must have exactly the same size as the new value. If len +is UNIV_SQL_NULL then the field is treated as an SQL null. +For records in ROW_FORMAT=COMPACT (new-style records), len must not be +UNIV_SQL_NULL unless the field already is SQL null. */ +UNIV_INLINE +void +rec_set_nth_field( +/*==============*/ + rec_t* rec, /*!< in: record */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index number of the field */ + const void* data, /*!< in: pointer to the data if not SQL null */ + ulint len) /*!< in: length of the data or UNIV_SQL_NULL. + If not SQL null, must have the same + length as the previous value. + If SQL null, previous value must be + SQL null. */ + __attribute__((nonnull(1,2))); +/**********************************************************//** +The following function returns the data size of an old-style physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_get_data_size_old( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +The following function returns the number of allocated elements +for an array of offsets. +@return number of elements */ +UNIV_INLINE +ulint +rec_offs_get_n_alloc( +/*=================*/ + const ulint* offsets)/*!< in: array for rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + ulint* offsets, /*!< out: array for rec_get_offsets(), + must be allocated */ + ulint n_alloc) /*!< in: number of elements */ + __attribute__((nonnull)); +#define rec_offs_init(offsets) \ + rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets) +/**********************************************************//** +The following function returns the number of fields in a record. +@return number of fields */ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*==============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +Returns the total size of record minus data size of record. +The value returned by the function is the distance from record +start to record origin in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +Returns the total size of a physical record. +@return size */ +UNIV_INLINE +ulint +rec_offs_size( +/*==========*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +#ifdef UNIV_DEBUG +/**********************************************************//** +Returns a pointer to the start of the record. +@return pointer to start */ +UNIV_INLINE +byte* +rec_get_start( +/*==========*/ + const rec_t* rec, /*!< in: pointer to record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +/**********************************************************//** +Returns a pointer to the end of the record. +@return pointer to end */ +UNIV_INLINE +byte* +rec_get_end( +/*========*/ + const rec_t* rec, /*!< in: pointer to record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull, pure, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets)) +# define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets)) +#endif /* UNIV_DEBUG */ +/***************************************************************//** +Copies a physical record to a buffer. +@return pointer to the origin of the copy */ +UNIV_INLINE +rec_t* +rec_copy( +/*=====*/ + void* buf, /*!< in: buffer */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Determines the size of a data tuple prefix in a temporary file. +@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_temp( +/*========================*/ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ + __attribute__((warn_unused_result, nonnull)); + +/******************************************************//** +Determine the offset to each field in temporary file. +@see rec_convert_dtuple_to_temp() */ +UNIV_INTERN +void +rec_init_offsets_temp( +/*==================*/ + const rec_t* rec, /*!< in: temporary file record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ + __attribute__((nonnull)); + +/*********************************************************//** +Builds a temporary file record out of a data tuple. +@see rec_init_offsets_temp() */ +UNIV_INTERN +void +rec_convert_dtuple_to_temp( +/*=======================*/ + rec_t* rec, /*!< out: record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields) /*!< in: number of fields */ + __attribute__((nonnull)); + +/**************************************************************//** +Copies the first n fields of a physical record to a new physical record in +a buffer. +@return own: copied record */ +UNIV_INTERN +rec_t* +rec_copy_prefix_to_buf( +/*===================*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n_fields, /*!< in: number of fields + to copy */ + byte** buf, /*!< in/out: memory buffer + for the copied prefix, + or NULL */ + ulint* buf_size) /*!< in/out: buffer size */ + __attribute__((nonnull)); +/************************************************************//** +Folds a prefix of a physical record to a ulint. +@return the folded value */ +UNIV_INLINE +ulint +rec_fold( +/*=====*/ + const rec_t* rec, /*!< in: the physical record */ + const ulint* offsets, /*!< in: array returned by + rec_get_offsets() */ + ulint n_fields, /*!< in: number of complete + fields to fold */ + ulint n_bytes, /*!< in: number of bytes to fold + in an incomplete last field */ + index_id_t tree_id) /*!< in: index tree id */ + __attribute__((nonnull, pure, warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************//** +Builds a physical record out of a data tuple and +stores it into the given buffer. +@return pointer to the origin of physical record */ +UNIV_INTERN +rec_t* +rec_convert_dtuple_to_rec( +/*======================*/ + byte* buf, /*!< in: start address of the + physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of + externally stored columns */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************//** +Returns the extra size of an old-style physical record if we know its +data size and number of fields. +@return extra size */ +UNIV_INLINE +ulint +rec_get_converted_extra_size( +/*=========================*/ + ulint data_size, /*!< in: data size */ + ulint n_fields, /*!< in: number of fields */ + ulint n_ext) /*!< in: number of externally stored columns */ + __attribute__((const)); +/**********************************************************//** +Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. +@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_comp_prefix( +/*===============================*/ + const dict_index_t* index, /*!< in: record descriptor */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ + __attribute__((warn_unused_result, nonnull(1,2))); +/**********************************************************//** +Determines the size of a data tuple in ROW_FORMAT=COMPACT. +@return total size */ +UNIV_INTERN +ulint +rec_get_converted_size_comp( +/*========================*/ + const dict_index_t* index, /*!< in: record descriptor; + dict_table_is_comp() is + assumed to hold, even if + it does not */ + ulint status, /*!< in: status bits of the record */ + const dfield_t* fields, /*!< in: array of data fields */ + ulint n_fields,/*!< in: number of data fields */ + ulint* extra) /*!< out: extra size */ + __attribute__((nonnull(1,3))); +/**********************************************************//** +The following function returns the size of a data tuple when converted to +a physical record. +@return size */ +UNIV_INLINE +ulint +rec_get_converted_size( +/*===================*/ + dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of externally stored columns */ + __attribute__((warn_unused_result, nonnull)); +#ifndef UNIV_HOTBACKUP +/**************************************************************//** +Copies the first n fields of a physical record to a data tuple. +The fields are copied to the memory heap. */ +UNIV_INTERN +void +rec_copy_prefix_to_dtuple( +/*======================*/ + dtuple_t* tuple, /*!< out: data tuple */ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n_fields, /*!< in: number of fields + to copy */ + mem_heap_t* heap) /*!< in: memory heap */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************//** +Validates the consistency of a physical record. +@return TRUE if ok */ +UNIV_INTERN +ibool +rec_validate( +/*=========*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); +/***************************************************************//** +Prints an old-style physical record. */ +UNIV_INTERN +void +rec_print_old( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec) /*!< in: physical record */ + __attribute__((nonnull)); +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Prints a physical record in ROW_FORMAT=COMPACT. Ignores the +record header. */ +UNIV_INTERN +void +rec_print_comp( +/*===========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); +/***************************************************************//** +Prints a physical record. */ +UNIV_INTERN +void +rec_print_new( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ + __attribute__((nonnull)); +/***************************************************************//** +Prints a physical record. */ +UNIV_INTERN +void +rec_print( +/*======*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ + __attribute__((nonnull)); + +# ifdef UNIV_DEBUG +/************************************************************//** +Reads the DB_TRX_ID of a clustered index record. +@return the value of DB_TRX_ID */ +UNIV_INTERN +trx_id_t +rec_get_trx_id( +/*===========*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index) /*!< in: clustered index */ + __attribute__((nonnull, warn_unused_result)); +# endif /* UNIV_DEBUG */ +#endif /* UNIV_HOTBACKUP */ + +/* Maximum lengths for the data in a physical record if the offsets +are given in one byte (resp. two byte) format. */ +#define REC_1BYTE_OFFS_LIMIT 0x7FUL +#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL + +/* The data size of record must be smaller than this because we reserve +two upmost bits in a two byte offset for special purposes */ +#define REC_MAX_DATA_SIZE (16 * 1024) + +#ifndef UNIV_NONINL +#include "rem0rec.ic" +#endif + +#endif diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic new file mode 100644 index 00000000000..a539320dd2a --- /dev/null +++ b/storage/innobase/include/rem0rec.ic @@ -0,0 +1,1718 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0rec.ic +Record manager + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "mach0data.h" +#include "ut0byte.h" +#include "dict0dict.h" +#include "btr0types.h" + +/* Compact flag ORed to the extra size returned by rec_get_offsets() */ +#define REC_OFFS_COMPACT ((ulint) 1 << 31) +/* SQL NULL flag in offsets returned by rec_get_offsets() */ +#define REC_OFFS_SQL_NULL ((ulint) 1 << 31) +/* External flag in offsets returned by rec_get_offsets() */ +#define REC_OFFS_EXTERNAL ((ulint) 1 << 30) +/* Mask for offsets returned by rec_get_offsets() */ +#define REC_OFFS_MASK (REC_OFFS_EXTERNAL - 1) + +/* Offsets of the bit-fields in an old-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits pointer to next record + 2 8 bits pointer to next record + 3 1 bit short flag + 7 bits number of fields + 4 3 bits number of fields + 5 bits heap number + 5 8 bits heap number + 6 4 bits n_owned + 4 bits info bits +*/ + +/* Offsets of the bit-fields in a new-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits relative offset of next record + 2 8 bits relative offset of next record + the relative offset is an unsigned 16-bit + integer: + (offset_of_next_record + - offset_of_this_record) mod 64Ki, + where mod is the modulo as a non-negative + number; + we can calculate the offset of the next + record with the formula: + relative_offset + offset_of_this_record + mod UNIV_PAGE_SIZE + 3 3 bits status: + 000=conventional record + 001=node pointer record (inside B-tree) + 010=infimum record + 011=supremum record + 1xx=reserved + 5 bits heap number + 4 8 bits heap number + 5 4 bits n_owned + 4 bits info bits +*/ + +/* We list the byte offsets from the origin of the record, the mask, +and the shift needed to obtain each bit-field of the record. */ + +#define REC_NEXT 2 +#define REC_NEXT_MASK 0xFFFFUL +#define REC_NEXT_SHIFT 0 + +#define REC_OLD_SHORT 3 /* This is single byte bit-field */ +#define REC_OLD_SHORT_MASK 0x1UL +#define REC_OLD_SHORT_SHIFT 0 + +#define REC_OLD_N_FIELDS 4 +#define REC_OLD_N_FIELDS_MASK 0x7FEUL +#define REC_OLD_N_FIELDS_SHIFT 1 + +#define REC_NEW_STATUS 3 /* This is single byte bit-field */ +#define REC_NEW_STATUS_MASK 0x7UL +#define REC_NEW_STATUS_SHIFT 0 + +#define REC_OLD_HEAP_NO 5 +#define REC_HEAP_NO_MASK 0xFFF8UL +#if 0 /* defined in rem0rec.h for use of page0zip.cc */ +#define REC_NEW_HEAP_NO 4 +#define REC_HEAP_NO_SHIFT 3 +#endif + +#define REC_OLD_N_OWNED 6 /* This is single byte bit-field */ +#define REC_NEW_N_OWNED 5 /* This is single byte bit-field */ +#define REC_N_OWNED_MASK 0xFUL +#define REC_N_OWNED_SHIFT 0 + +#define REC_OLD_INFO_BITS 6 /* This is single byte bit-field */ +#define REC_NEW_INFO_BITS 5 /* This is single byte bit-field */ +#define REC_INFO_BITS_MASK 0xF0UL +#define REC_INFO_BITS_SHIFT 0 + +#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \ + ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \ + ^ 0xFFFFFFFFUL +# error "sum of old-style masks != 0xFFFFFFFFUL" +#endif +#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \ + ^ 0xFFFFFFUL +# error "sum of new-style masks != 0xFFFFFFUL" +#endif + +/***********************************************************//** +Sets the value of the ith field SQL null bit of an old-style record. */ +UNIV_INTERN +void +rec_set_nth_field_null_bit( +/*=======================*/ + rec_t* rec, /*!< in: record */ + ulint i, /*!< in: ith field */ + ibool val); /*!< in: value to set */ +/***********************************************************//** +Sets an old-style record field to SQL null. +The physical size of the field is not changed. */ +UNIV_INTERN +void +rec_set_nth_field_sql_null( +/*=======================*/ + rec_t* rec, /*!< in: record */ + ulint n); /*!< in: index of the field */ + +/******************************************************//** +Gets a bit field from within 1 byte. */ +UNIV_INLINE +ulint +rec_get_bit_field_1( +/*================*/ + const rec_t* rec, /*!< in: pointer to record origin */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + + return((mach_read_from_1(rec - offs) & mask) >> shift); +} + +/******************************************************//** +Sets a bit field within 1 byte. */ +UNIV_INLINE +void +rec_set_bit_field_1( +/*================*/ + rec_t* rec, /*!< in: pointer to record origin */ + ulint val, /*!< in: value to set */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); + ut_ad(mask); + ut_ad(mask <= 0xFFUL); + ut_ad(((mask >> shift) << shift) == mask); + ut_ad(((val << shift) & mask) == (val << shift)); + + mach_write_to_1(rec - offs, + (mach_read_from_1(rec - offs) & ~mask) + | (val << shift)); +} + +/******************************************************//** +Gets a bit field from within 2 bytes. */ +UNIV_INLINE +ulint +rec_get_bit_field_2( +/*================*/ + const rec_t* rec, /*!< in: pointer to record origin */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + + return((mach_read_from_2(rec - offs) & mask) >> shift); +} + +/******************************************************//** +Sets a bit field within 2 bytes. */ +UNIV_INLINE +void +rec_set_bit_field_2( +/*================*/ + rec_t* rec, /*!< in: pointer to record origin */ + ulint val, /*!< in: value to set */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); + ut_ad(mask > 0xFFUL); + ut_ad(mask <= 0xFFFFUL); + ut_ad((mask >> shift) & 1); + ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); + ut_ad(((mask >> shift) << shift) == mask); + ut_ad(((val << shift) & mask) == (val << shift)); + + mach_write_to_2(rec - offs, + (mach_read_from_2(rec - offs) & ~mask) + | (val << shift)); +} + +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. +@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +const rec_t* +rec_get_next_ptr_const( +/*===================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + ulint field_value; + + ut_ad(REC_NEXT_MASK == 0xFFFFUL); + ut_ad(REC_NEXT_SHIFT == 0); + + field_value = mach_read_from_2(rec - REC_NEXT); + + if (field_value == 0) { + + return(NULL); + } + + if (comp) { +#if UNIV_PAGE_SIZE_MAX <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ + + /* In the following assertion, field_value is interpreted + as signed 16-bit integer in 2's complement arithmetics. + If all platforms defined int16_t in the standard headers, + the expression could be written simpler as + (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE + */ + ut_ad((field_value >= 32768 + ? field_value - 65536 + : field_value) + + ut_align_offset(rec, UNIV_PAGE_SIZE) + < UNIV_PAGE_SIZE); +#endif + /* There must be at least REC_N_NEW_EXTRA_BYTES + 1 + between each record. */ + ut_ad((field_value > REC_N_NEW_EXTRA_BYTES + && field_value < 32768) + || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES); + + return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE) + + ut_align_offset(rec + field_value, UNIV_PAGE_SIZE)); + } else { + ut_ad(field_value < UNIV_PAGE_SIZE); + + return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE) + + field_value); + } +} + +/******************************************************//** +The following function is used to get the pointer of the next chained record +on the same page. +@return pointer to the next chained record, or NULL if none */ +UNIV_INLINE +rec_t* +rec_get_next_ptr( +/*=============*/ + rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + return(const_cast<rec_t*>(rec_get_next_ptr_const(rec, comp))); +} + +/******************************************************//** +The following function is used to get the offset of the next chained record +on the same page. +@return the page offset of the next chained record, or 0 if none */ +UNIV_INLINE +ulint +rec_get_next_offs( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + ulint field_value; +#if REC_NEXT_MASK != 0xFFFFUL +# error "REC_NEXT_MASK != 0xFFFFUL" +#endif +#if REC_NEXT_SHIFT +# error "REC_NEXT_SHIFT != 0" +#endif + + field_value = mach_read_from_2(rec - REC_NEXT); + + if (comp) { +#if UNIV_PAGE_SIZE_MAX <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ + + /* In the following assertion, field_value is interpreted + as signed 16-bit integer in 2's complement arithmetics. + If all platforms defined int16_t in the standard headers, + the expression could be written simpler as + (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE + */ + ut_ad((field_value >= 32768 + ? field_value - 65536 + : field_value) + + ut_align_offset(rec, UNIV_PAGE_SIZE) + < UNIV_PAGE_SIZE); +#endif + if (field_value == 0) { + + return(0); + } + + /* There must be at least REC_N_NEW_EXTRA_BYTES + 1 + between each record. */ + ut_ad((field_value > REC_N_NEW_EXTRA_BYTES + && field_value < 32768) + || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES); + + return(ut_align_offset(rec + field_value, UNIV_PAGE_SIZE)); + } else { + ut_ad(field_value < UNIV_PAGE_SIZE); + + return(field_value); + } +} + +/******************************************************//** +The following function is used to set the next record offset field +of an old-style record. */ +UNIV_INLINE +void +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint next) /*!< in: offset of the next record */ +{ + ut_ad(rec); + ut_ad(UNIV_PAGE_SIZE > next); +#if REC_NEXT_MASK != 0xFFFFUL +# error "REC_NEXT_MASK != 0xFFFFUL" +#endif +#if REC_NEXT_SHIFT +# error "REC_NEXT_SHIFT != 0" +#endif + + mach_write_to_2(rec - REC_NEXT, next); +} + +/******************************************************//** +The following function is used to set the next record offset field +of a new-style record. */ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint next) /*!< in: offset of the next record */ +{ + ulint field_value; + + ut_ad(rec); + ut_ad(UNIV_PAGE_SIZE > next); + + if (!next) { + field_value = 0; + } else { + /* The following two statements calculate + next - offset_of_rec mod 64Ki, where mod is the modulo + as a non-negative number */ + + field_value = (ulint) + ((lint) next + - (lint) ut_align_offset(rec, UNIV_PAGE_SIZE)); + field_value &= REC_NEXT_MASK; + } + + mach_write_to_2(rec - REC_NEXT, field_value); +} + +/******************************************************//** +The following function is used to get the number of fields +in an old-style record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields_old( +/*=================*/ + const rec_t* rec) /*!< in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, + REC_OLD_N_FIELDS_SHIFT); + ut_ad(ret <= REC_MAX_N_FIELDS); + ut_ad(ret > 0); + + return(ret); +} + +/******************************************************//** +The following function is used to set the number of fields +in an old-style record. */ +UNIV_INLINE +void +rec_set_n_fields_old( +/*=================*/ + rec_t* rec, /*!< in: physical record */ + ulint n_fields) /*!< in: the number of fields */ +{ + ut_ad(rec); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields > 0); + + rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); +} + +/******************************************************//** +The following function retrieves the status bits of a new-style record. +@return status bits */ +UNIV_INLINE +ulint +rec_get_status( +/*===========*/ + const rec_t* rec) /*!< in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_1(rec, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); + ut_ad((ret & ~REC_NEW_STATUS_MASK) == 0); + + return(ret); +} + +/******************************************************//** +The following function is used to get the number of fields +in a record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ +{ + ut_ad(rec); + ut_ad(index); + + if (!dict_table_is_comp(index->table)) { + return(rec_get_n_fields_old(rec)); + } + + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + return(dict_index_get_n_fields(index)); + case REC_STATUS_NODE_PTR: + return(dict_index_get_n_unique_in_tree(index) + 1); + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + return(1); + default: + ut_error; + return(ULINT_UNDEFINED); + } +} + +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_old( +/*================*/ + const rec_t* rec) /*!< in: old-style physical record */ +{ + return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT)); +} + +/******************************************************//** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_old( +/*================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint n_owned) /*!< in: the number of owned */ +{ + rec_set_bit_field_1(rec, n_owned, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); +} + +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. +@return number of owned records */ +UNIV_INLINE +ulint +rec_get_n_owned_new( +/*================*/ + const rec_t* rec) /*!< in: new-style physical record */ +{ + return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT)); +} + +/******************************************************//** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_new( +/*================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint n_owned)/*!< in: the number of owned */ +{ + rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + if (page_zip && rec_get_status(rec) != REC_STATUS_SUPREMUM) { + page_zip_rec_set_owned(page_zip, rec, n_owned); + } +} + +/******************************************************//** +The following function is used to retrieve the info bits of a record. +@return info bits */ +UNIV_INLINE +ulint +rec_get_info_bits( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + return(rec_get_bit_field_1( + rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT)); +} + +/******************************************************//** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint bits) /*!< in: info bits */ +{ + rec_set_bit_field_1(rec, bits, REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); +} +/******************************************************//** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint bits) /*!< in: info bits */ +{ + rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); +} + +/******************************************************//** +The following function is used to set the status bits of a new-style record. */ +UNIV_INLINE +void +rec_set_status( +/*===========*/ + rec_t* rec, /*!< in/out: physical record */ + ulint bits) /*!< in: info bits */ +{ + rec_set_bit_field_1(rec, bits, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); +} + +/******************************************************//** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) +@return info bits */ +UNIV_INLINE +ulint +rec_get_info_and_status_bits( +/*=========================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + ulint bits; +#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ +& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) +# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" +#endif + if (comp) { + bits = rec_get_info_bits(rec, TRUE) | rec_get_status(rec); + } else { + bits = rec_get_info_bits(rec, FALSE); + ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); + } + return(bits); +} +/******************************************************//** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /*!< in/out: physical record */ + ulint bits) /*!< in: info bits */ +{ +#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ +& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) +# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" +#endif + rec_set_status(rec, bits & REC_NEW_STATUS_MASK); + rec_set_info_bits_new(rec, bits & ~REC_NEW_STATUS_MASK); +} + +/******************************************************//** +The following function tells if record is delete marked. +@return nonzero if delete marked */ +UNIV_INLINE +ulint +rec_get_deleted_flag( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + if (comp) { + return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS, + REC_INFO_DELETED_FLAG, + REC_INFO_BITS_SHIFT)); + } else { + return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS, + REC_INFO_DELETED_FLAG, + REC_INFO_BITS_SHIFT)); + } +} + +/******************************************************//** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_old( +/*=====================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint flag) /*!< in: nonzero if delete marked */ +{ + ulint val; + + val = rec_get_info_bits(rec, FALSE); + + if (flag) { + val |= REC_INFO_DELETED_FLAG; + } else { + val &= ~REC_INFO_DELETED_FLAG; + } + + rec_set_info_bits_old(rec, val); +} + +/******************************************************//** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_new( +/*=====================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint flag) /*!< in: nonzero if delete marked */ +{ + ulint val; + + val = rec_get_info_bits(rec, TRUE); + + if (flag) { + val |= REC_INFO_DELETED_FLAG; + } else { + val &= ~REC_INFO_DELETED_FLAG; + } + + rec_set_info_bits_new(rec, val); + + if (page_zip) { + page_zip_rec_set_deleted(page_zip, rec, flag); + } +} + +/******************************************************//** +The following function tells if a new-style record is a node pointer. +@return TRUE if node pointer */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(REC_STATUS_NODE_PTR == rec_get_status(rec)); +} + +/******************************************************//** +The following function is used to get the order number +of an old-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/******************************************************//** +The following function is used to set the heap number +field in an old-style record. */ +UNIV_INLINE +void +rec_set_heap_no_old( +/*================*/ + rec_t* rec, /*!< in: physical record */ + ulint heap_no)/*!< in: the heap number */ +{ + rec_set_bit_field_2(rec, heap_no, REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); +} + +/******************************************************//** +The following function is used to get the order number +of a new-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/******************************************************//** +The following function is used to set the heap number +field in a new-style record. */ +UNIV_INLINE +void +rec_set_heap_no_new( +/*================*/ + rec_t* rec, /*!< in/out: physical record */ + ulint heap_no)/*!< in: the heap number */ +{ + rec_set_bit_field_2(rec, heap_no, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); +} + +/******************************************************//** +The following function is used to test whether the data offsets in the record +are stored in one-byte or two-byte format. +@return TRUE if 1-byte form */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + const rec_t* rec) /*!< in: physical record */ +{ +#if TRUE != 1 +#error "TRUE != 1" +#endif + + return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT)); +} + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ +{ +#if TRUE != 1 +#error "TRUE != 1" +#endif + ut_ad(flag <= TRUE); + + rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT); +} + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1))); +} + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +ulint +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); +} + +/******************************************************//** +Returns nonzero if the field is stored off-page. +@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK); +} + +/* Get the base address of offsets. The extra_size is stored at +this position, and following positions hold the end offsets of +the fields. */ +#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE) + +/**********************************************************//** +The following function returns the number of allocated elements +for an array of offsets. +@return number of elements */ +UNIV_INLINE +ulint +rec_offs_get_n_alloc( +/*=================*/ + const ulint* offsets)/*!< in: array for rec_get_offsets() */ +{ + ulint n_alloc; + ut_ad(offsets); + n_alloc = offsets[0]; + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + UNIV_MEM_ASSERT_W(offsets, n_alloc * sizeof *offsets); + return(n_alloc); +} + +/**********************************************************//** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + ulint* offsets, /*!< out: array for rec_get_offsets(), + must be allocated */ + ulint n_alloc) /*!< in: number of elements */ +{ + ut_ad(offsets); + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + UNIV_MEM_ASSERT_AND_ALLOC(offsets, n_alloc * sizeof *offsets); + offsets[0] = n_alloc; +} + +/**********************************************************//** +The following function returns the number of fields in a record. +@return number of fields */ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*==============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint n_fields; + ut_ad(offsets); + n_fields = offsets[1]; + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + return(n_fields); +} + +/************************************************************//** +Validates offsets returned by rec_get_offsets(). +@return TRUE if valid */ +UNIV_INLINE +ibool +rec_offs_validate( +/*==============*/ + const rec_t* rec, /*!< in: record or NULL */ + const dict_index_t* index, /*!< in: record descriptor or NULL */ + const ulint* offsets)/*!< in: array returned by + rec_get_offsets() */ +{ + ulint i = rec_offs_n_fields(offsets); + ulint last = ULINT_MAX; + ulint comp = *rec_offs_base(offsets) & REC_OFFS_COMPACT; + + if (rec) { + ut_ad((ulint) rec == offsets[2]); + if (!comp) { + ut_a(rec_get_n_fields_old(rec) >= i); + } + } + if (index) { + ulint max_n_fields; + ut_ad((ulint) index == offsets[3]); + max_n_fields = ut_max( + dict_index_get_n_fields(index), + dict_index_get_n_unique_in_tree(index) + 1); + if (comp && rec) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_NODE_PTR: + max_n_fields = dict_index_get_n_unique_in_tree( + index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + max_n_fields = 1; + break; + default: + ut_error; + } + } + /* index->n_def == 0 for dummy indexes if !comp */ + ut_a(!comp || index->n_def); + ut_a(!index->n_def || i <= max_n_fields); + } + while (i--) { + ulint curr = rec_offs_base(offsets)[1 + i] & REC_OFFS_MASK; + ut_a(curr <= last); + last = curr; + } + return(TRUE); +} +#ifdef UNIV_DEBUG +/************************************************************//** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. */ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint* offsets)/*!< in: array returned by + rec_get_offsets() */ +{ + ut_ad(rec); + ut_ad(index); + ut_ad(offsets); + ut_ad(rec_get_n_fields(rec, index) >= rec_offs_n_fields(offsets)); + offsets[2] = (ulint) rec; + offsets[3] = (ulint) index; +} +#endif /* UNIV_DEBUG */ + +/************************************************************//** +The following function is used to get an offset to the nth +data field in a record. +@return offset from the origin of rec */ +UNIV_INLINE +ulint +rec_get_nth_field_offs( +/*===================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ +{ + ulint offs; + ulint length; + ut_ad(n < rec_offs_n_fields(offsets)); + ut_ad(len); + + if (n == 0) { + offs = 0; + } else { + offs = rec_offs_base(offsets)[n] & REC_OFFS_MASK; + } + + length = rec_offs_base(offsets)[1 + n]; + + if (length & REC_OFFS_SQL_NULL) { + length = UNIV_SQL_NULL; + } else { + length &= REC_OFFS_MASK; + length -= offs; + } + + *len = length; + return(offs); +} + +/******************************************************//** +Determine if the offsets are for a record in the new +compact format. +@return nonzero if compact format */ +UNIV_INLINE +ulint +rec_offs_comp( +/*==========*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return(*rec_offs_base(offsets) & REC_OFFS_COMPACT); +} + +/******************************************************//** +Determine if the offsets are for a record containing +externally stored columns. +@return nonzero if externally stored */ +UNIV_INLINE +ulint +rec_offs_any_extern( +/*================*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL); +} + +/******************************************************//** +Determine if the offsets are for a record containing null BLOB pointers. +@return first field containing a null BLOB pointer, or NULL if none found */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + const ulint* offsets) /*!< in: rec_get_offsets(rec) */ +{ + ulint i; + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (!rec_offs_any_extern(offsets)) { + return(NULL); + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + const byte* field + = rec_get_nth_field(rec, offsets, i, &len); + + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + if (!memcmp(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + return(field); + } + } + } + + return(NULL); +} + +/******************************************************//** +Returns nonzero if the extern bit is set in nth field of rec. +@return nonzero if externally stored */ +UNIV_INLINE +ulint +rec_offs_nth_extern( +/*================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return(rec_offs_base(offsets)[1 + n] & REC_OFFS_EXTERNAL); +} + +/******************************************************//** +Returns nonzero if the SQL NULL bit is set in nth field of rec. +@return nonzero if SQL NULL */ +UNIV_INLINE +ulint +rec_offs_nth_sql_null( +/*==================*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return(rec_offs_base(offsets)[1 + n] & REC_OFFS_SQL_NULL); +} + +/******************************************************//** +Gets the physical size of a field. +@return length of field */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + if (!n) { + return(rec_offs_base(offsets)[1 + n] & REC_OFFS_MASK); + } + return((rec_offs_base(offsets)[1 + n] - rec_offs_base(offsets)[n]) + & REC_OFFS_MASK); +} + +/******************************************************//** +Returns the number of extern bits set in a record. +@return number of externally stored fields */ +UNIV_INLINE +ulint +rec_offs_n_extern( +/*==============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint n = 0; + + if (rec_offs_any_extern(offsets)) { + ulint i; + + for (i = rec_offs_n_fields(offsets); i--; ) { + if (rec_offs_nth_extern(offsets, i)) { + n++; + } + } + } + + return(n); +} + +/******************************************************//** +Returns the offset of n - 1th field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. This function and the 2-byte counterpart are defined here because the +C-compiler was not able to sum negative and positive constant offsets, and +warned of constant arithmetic overflow within the compiler. +@return offset of the start of the PREVIOUS field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_1_get_prev_field_end_info( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n))); +} + +/******************************************************//** +Returns the offset of n - 1th field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the PREVIOUS field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_2_get_prev_field_end_info( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n))); +} + +/******************************************************//** +Sets the field end info for the nth field if the record is stored in the +1-byte format. */ +UNIV_INLINE +void +rec_1_set_field_end_info( +/*=====================*/ + rec_t* rec, /*!< in: record */ + ulint n, /*!< in: field index */ + ulint info) /*!< in: value to set */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info); +} + +/******************************************************//** +Sets the field end info for the nth field if the record is stored in the +2-byte format. */ +UNIV_INLINE +void +rec_2_set_field_end_info( +/*=====================*/ + rec_t* rec, /*!< in: record */ + ulint n, /*!< in: field index */ + ulint info) /*!< in: value to set */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info); +} + +/******************************************************//** +Returns the offset of nth field start if the record is stored in the 1-byte +offsets form. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_1_get_field_start_offs( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_1_get_prev_field_end_info(rec, n) + & ~REC_1BYTE_SQL_NULL_MASK); +} + +/******************************************************//** +Returns the offset of nth field start if the record is stored in the 2-byte +offsets form. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_2_get_field_start_offs( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_2_get_prev_field_end_info(rec, n) + & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK)); +} + +/******************************************************//** +The following function is used to read the offset of the start of a data field +in the record. The start of an SQL null field is the end offset of the +previous non-null field, or 0, if none exists. If n is the number of the last +field + 1, then the end offset of the last field is returned. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_get_field_start_offs( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + if (rec_get_1byte_offs_flag(rec)) { + + return(rec_1_get_field_start_offs(rec, n)); + } + + return(rec_2_get_field_start_offs(rec, n)); +} + +/************************************************************//** +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. +@return field size in bytes */ +UNIV_INLINE +ulint +rec_get_nth_field_size( +/*===================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: index of the field */ +{ + ulint os; + ulint next_os; + + os = rec_get_field_start_offs(rec, n); + next_os = rec_get_field_start_offs(rec, n + 1); + + ut_ad(next_os - os < UNIV_PAGE_SIZE); + + return(next_os - os); +} + +/***********************************************************//** +This is used to modify the value of an already existing field in a record. +The previous value must have exactly the same size as the new value. If len +is UNIV_SQL_NULL then the field is treated as an SQL null. +For records in ROW_FORMAT=COMPACT (new-style records), len must not be +UNIV_SQL_NULL unless the field already is SQL null. */ +UNIV_INLINE +void +rec_set_nth_field( +/*==============*/ + rec_t* rec, /*!< in: record */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index number of the field */ + const void* data, /*!< in: pointer to the data + if not SQL null */ + ulint len) /*!< in: length of the data or UNIV_SQL_NULL */ +{ + byte* data2; + ulint len2; + + ut_ad(rec); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (len == UNIV_SQL_NULL) { + if (!rec_offs_nth_sql_null(offsets, n)) { + ut_a(!rec_offs_comp(offsets)); + rec_set_nth_field_sql_null(rec, n); + } + + return; + } + + data2 = rec_get_nth_field(rec, offsets, n, &len2); + if (len2 == UNIV_SQL_NULL) { + ut_ad(!rec_offs_comp(offsets)); + rec_set_nth_field_null_bit(rec, n, FALSE); + ut_ad(len == rec_get_nth_field_size(rec, n)); + } else { + ut_ad(len2 == len); + } + + ut_memcpy(data2, data, len); +} + +/**********************************************************//** +The following function returns the data size of an old-style physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_get_data_size_old( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ +{ + ut_ad(rec); + + return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec))); +} + +/**********************************************************//** +The following function sets the number of fields in offsets. */ +UNIV_INLINE +void +rec_offs_set_n_fields( +/*==================*/ + ulint* offsets, /*!< in/out: array returned by + rec_get_offsets() */ + ulint n_fields) /*!< in: number of fields */ +{ + ut_ad(offsets); + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + offsets[1] = n_fields; +} + +/**********************************************************//** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint size; + + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = rec_offs_base(offsets)[rec_offs_n_fields(offsets)] + & REC_OFFS_MASK; + ut_ad(size < UNIV_PAGE_SIZE); + return(size); +} + +/**********************************************************//** +Returns the total size of record minus data size of record. The value +returned by the function is the distance from record start to record origin +in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint size; + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = *rec_offs_base(offsets) & ~(REC_OFFS_COMPACT | REC_OFFS_EXTERNAL); + ut_ad(size < UNIV_PAGE_SIZE); + return(size); +} + +/**********************************************************//** +Returns the total size of a physical record. +@return size */ +UNIV_INLINE +ulint +rec_offs_size( +/*==========*/ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets)); +} + +#ifdef UNIV_DEBUG +/**********************************************************//** +Returns a pointer to the end of the record. +@return pointer to end */ +UNIV_INLINE +byte* +rec_get_end( +/*========*/ + const rec_t* rec, /*!< in: pointer to record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(const_cast<rec_t*>(rec + rec_offs_data_size(offsets))); +} + +/**********************************************************//** +Returns a pointer to the start of the record. +@return pointer to start */ +UNIV_INLINE +byte* +rec_get_start( +/*==========*/ + const rec_t* rec, /*!< in: pointer to record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(const_cast<rec_t*>(rec - rec_offs_extra_size(offsets))); +} +#endif /* UNIV_DEBUG */ + +/***************************************************************//** +Copies a physical record to a buffer. +@return pointer to the origin of the copy */ +UNIV_INLINE +rec_t* +rec_copy( +/*=====*/ + void* buf, /*!< in: buffer */ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint extra_len; + ulint data_len; + + ut_ad(rec && buf); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + + extra_len = rec_offs_extra_size(offsets); + data_len = rec_offs_data_size(offsets); + + ut_memcpy(buf, rec - extra_len, extra_len + data_len); + + return((byte*) buf + extra_len); +} + +/**********************************************************//** +Returns the extra size of an old-style physical record if we know its +data size and number of fields. +@return extra size */ +UNIV_INLINE +ulint +rec_get_converted_extra_size( +/*=========================*/ + ulint data_size, /*!< in: data size */ + ulint n_fields, /*!< in: number of fields */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) { + + return(REC_N_OLD_EXTRA_BYTES + n_fields); + } + + return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields); +} + +/**********************************************************//** +The following function returns the size of a data tuple when converted to +a physical record. +@return size */ +UNIV_INLINE +ulint +rec_get_converted_size( +/*===================*/ + dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + ulint data_size; + ulint extra_size; + + ut_ad(index); + ut_ad(dtuple); + ut_ad(dtuple_check_typed(dtuple)); + + ut_ad(dict_index_is_univ(index) + || dtuple_get_n_fields(dtuple) + == (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + == REC_STATUS_NODE_PTR) + ? dict_index_get_n_unique_in_tree(index) + 1 + : dict_index_get_n_fields(index))); + + if (dict_table_is_comp(index->table)) { + return(rec_get_converted_size_comp(index, + dtuple_get_info_bits(dtuple) + & REC_NEW_STATUS_MASK, + dtuple->fields, + dtuple->n_fields, NULL)); + } + + data_size = dtuple_get_data_size(dtuple, 0); + + extra_size = rec_get_converted_extra_size( + data_size, dtuple_get_n_fields(dtuple), n_ext); + +#if 0 + /* This code is inactive since it may be the wrong place to add + in the size of node pointers used in parent pages AND it is not + currently needed since ha_innobase::max_supported_key_length() + ensures that the key size limit for each page size is well below + the actual limit ((free space on page / 4) - record overhead). + But those limits will need to be raised when InnoDB can + support multiple page sizes. At that time, we will need + to consider the node pointer on these universal btrees. */ + + if (dict_index_is_univ(index)) { + /* This is for the insert buffer B-tree. + All fields in the leaf tuple ascend to the + parent node plus the child page pointer. */ + + /* ibuf cannot contain externally stored fields */ + ut_ad(n_ext == 0); + + /* Add the data pointer and recompute extra_size + based on one more field. */ + data_size += REC_NODE_PTR_SIZE; + extra_size = rec_get_converted_extra_size( + data_size, + dtuple_get_n_fields(dtuple) + 1, + 0); + + /* Be sure dtuple->n_fields has this node ptr + accounted for. This function should correspond to + what rec_convert_dtuple_to_rec() needs in storage. + In optimistic insert or update-not-in-place, we will + have to ensure that if the record is converted to a + node pointer, it will not become too large.*/ + } +#endif + + return(data_size + extra_size); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Folds a prefix of a physical record to a ulint. Folds only existing fields, +that is, checks that we do not run out of the record. +@return the folded value */ +UNIV_INLINE +ulint +rec_fold( +/*=====*/ + const rec_t* rec, /*!< in: the physical record */ + const ulint* offsets, /*!< in: array returned by + rec_get_offsets() */ + ulint n_fields, /*!< in: number of complete + fields to fold */ + ulint n_bytes, /*!< in: number of bytes to fold + in an incomplete last field */ + index_id_t tree_id) /*!< in: index tree id */ +{ + ulint i; + const byte* data; + ulint len; + ulint fold; + ulint n_fields_rec; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + ut_ad(n_fields + n_bytes > 0); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); + + if (n_fields > n_fields_rec) { + n_fields = n_fields_rec; + } + + if (n_fields == n_fields_rec) { + n_bytes = 0; + } + + fold = ut_fold_ull(tree_id); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h new file mode 100644 index 00000000000..f8133f77466 --- /dev/null +++ b/storage/innobase/include/rem0types.h @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0types.h +Record manager global types + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef rem0types_h +#define rem0types_h + +/* We define the physical record simply as an array of bytes */ +typedef byte rec_t; + +/* Maximum values for various fields (for non-blob tuples) */ +#define REC_MAX_N_FIELDS (1024 - 1) +#define REC_MAX_HEAP_NO (2 * 8192 - 1) +#define REC_MAX_N_OWNED (16 - 1) + +/* Maximum number of user defined fields/columns. The reserved columns +are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR. +We need "* 2" because mlog_parse_index() creates a dummy table object +possibly, with some of the system columns in it, and then adds the 3 +system columns (again) using dict_table_add_system_columns(). The problem +is that mlog_parse_index() cannot recognize the system columns by +just having n_fields, n_uniq and the lengths of the columns. */ +#define REC_MAX_N_USER_FIELDS (REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2) + +/* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum +indexed field length (or indexed prefix length) for indexes on tables of +ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format. +Before we support UTF-8 encodings with mbmaxlen = 4, a UTF-8 character +may take at most 3 bytes. So the limit was set to 3*256, so that one +can create a column prefix index on 256 characters of a TEXT or VARCHAR +column also in the UTF-8 charset. +This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data +files would be at risk! */ +#define REC_ANTELOPE_MAX_INDEX_COL_LEN 768 + +/** Maximum indexed field length for table format UNIV_FORMAT_B and +beyond. +This (3072) is the maximum index row length allowed, so we cannot create index +prefix column longer than that. */ +#define REC_VERSION_56_MAX_INDEX_COL_LEN 3072 + +/** Innodb row types are a subset of the MySQL global enum row_type. +They are made into their own enum so that switch statements can account +for each of them. */ +enum rec_format_enum { + REC_FORMAT_REDUNDANT = 0, /*!< REDUNDANT row format */ + REC_FORMAT_COMPACT = 1, /*!< COMPACT row format */ + REC_FORMAT_COMPRESSED = 2, /*!< COMPRESSED row format */ + REC_FORMAT_DYNAMIC = 3 /*!< DYNAMIC row format */ +}; +typedef enum rec_format_enum rec_format_t; + +#endif diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h new file mode 100644 index 00000000000..a098e2f9b29 --- /dev/null +++ b/storage/innobase/include/row0ext.h @@ -0,0 +1,102 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ext.h +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#ifndef row0ext_h +#define row0ext_h + +#include "univ.i" +#include "row0types.h" +#include "data0types.h" +#include "mem0mem.h" +#include "dict0types.h" + +/********************************************************************//** +Creates a cache of column prefixes of externally stored columns. +@return own: column prefix cache */ +UNIV_INTERN +row_ext_t* +row_ext_create( +/*===========*/ + ulint n_ext, /*!< in: number of externally stored columns */ + const ulint* ext, /*!< in: col_no's of externally stored columns + in the InnoDB table object, as reported by + dict_col_get_no(); NOT relative to the records + in the clustered index */ + ulint flags, /*!< in: table->flags */ + const dtuple_t* tuple, /*!< in: data tuple containing the field + references of the externally stored + columns; must be indexed by col_no; + the clustered index record must be + covered by a lock or a page latch + to prevent deletion (rollback or purge). */ + mem_heap_t* heap); /*!< in: heap where created */ + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + const row_ext_t* ext, /*!< in/out: column prefix cache */ + ulint i, /*!< in: index of ext->ext[] */ + ulint* len); /*!< out: length of prefix, in bytes, + at most the length determined by + DICT_MAX_FIELD_LEN_BY_FORMAT() */ +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint col, /*!< in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len); /*!< out: length of prefix, in bytes, + at most the length determined by + DICT_MAX_FIELD_LEN_BY_FORMAT() */ + +/** Prefixes of externally stored columns */ +struct row_ext_t{ + ulint n_ext; /*!< number of externally stored columns */ + const ulint* ext; /*!< col_no's of externally stored columns */ + byte* buf; /*!< backing store of the column prefix cache */ + ulint max_len;/*!< maximum prefix length, it could be + REC_ANTELOPE_MAX_INDEX_COL_LEN or + REC_VERSION_56_MAX_INDEX_COL_LEN depending + on row format */ + ulint len[1]; /*!< prefix lengths; 0 if not cached */ +}; + +#ifndef UNIV_NONINL +#include "row0ext.ic" +#endif + +#endif diff --git a/storage/innobase/include/row0ext.ic b/storage/innobase/include/row0ext.ic new file mode 100644 index 00000000000..39e150d91d5 --- /dev/null +++ b/storage/innobase/include/row0ext.ic @@ -0,0 +1,87 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ext.ic +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#include "rem0types.h" +#include "btr0types.h" + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + const row_ext_t* ext, /*!< in/out: column prefix cache */ + ulint i, /*!< in: index of ext->ext[] */ + ulint* len) /*!< out: length of prefix, in bytes, + at most ext->max_len */ +{ + ut_ad(ext); + ut_ad(len); + ut_ad(i < ext->n_ext); + + *len = ext->len[i]; + + ut_ad(*len <= ext->max_len); + ut_ad(ext->max_len > 0); + + if (*len == 0) { + /* The BLOB could not be fetched to the cache. */ + return(field_ref_zero); + } else { + return(ext->buf + i * ext->max_len); + } +} + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint col, /*!< in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len) /*!< out: length of prefix, in bytes, + at most ext->max_len */ +{ + ulint i; + + ut_ad(ext); + ut_ad(len); + + for (i = 0; i < ext->n_ext; i++) { + if (col == ext->ext[i]) { + return(row_ext_lookup_ith(ext, i, len)); + } + } + + return(NULL); +} diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h new file mode 100644 index 00000000000..4e04a099140 --- /dev/null +++ b/storage/innobase/include/row0ftsort.h @@ -0,0 +1,279 @@ +/***************************************************************************** + +Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ftsort.h +Create Full Text Index with (parallel) merge sort + +Created 10/13/2010 Jimmy Yang +*******************************************************/ + +#ifndef row0ftsort_h +#define row0ftsort_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "row0mysql.h" +#include "fts0fts.h" +#include "fts0types.h" +#include "fts0priv.h" +#include "row0merge.h" + +/** This structure defineds information the scan thread will fetch +and put to the linked list for parallel tokenization/sort threads +to process */ +typedef struct fts_doc_item fts_doc_item_t; + +/** Information about temporary files used in merge sort */ +struct fts_doc_item { + dfield_t* field; /*!< field contains document string */ + doc_id_t doc_id; /*!< document ID */ + UT_LIST_NODE_T(fts_doc_item_t) doc_list; + /*!< list of doc items */ +}; + +/** This defines the list type that scan thread would feed the parallel +tokenization threads and sort threads. */ +typedef UT_LIST_BASE_NODE_T(fts_doc_item_t) fts_doc_list_t; + +#define FTS_NUM_AUX_INDEX 6 +#define FTS_PLL_MERGE 1 + +/** Sort information passed to each individual parallel sort thread */ +struct fts_psort_t; + +/** Common info passed to each parallel sort thread */ +struct fts_psort_common_t { + row_merge_dup_t* dup; /*!< descriptor of FTS index */ + dict_table_t* new_table; /*!< source table */ + trx_t* trx; /*!< transaction */ + fts_psort_t* all_info; /*!< all parallel sort info */ + os_event_t sort_event; /*!< sort event */ + os_event_t merge_event; /*!< merge event */ + ibool opt_doc_id_size;/*!< whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort, if + Doc ID will not be big enough + to use 8 bytes value */ +}; + +struct fts_psort_t { + ulint psort_id; /*!< Parallel sort ID */ + row_merge_buf_t* merge_buf[FTS_NUM_AUX_INDEX]; + /*!< sort buffer */ + merge_file_t* merge_file[FTS_NUM_AUX_INDEX]; + /*!< sort file */ + row_merge_block_t* merge_block[FTS_NUM_AUX_INDEX]; + /*!< buffer to write to file */ + row_merge_block_t* block_alloc[FTS_NUM_AUX_INDEX]; + /*!< buffer to allocated */ + ulint child_status; /*!< child thread status */ + ulint state; /*!< parent thread state */ + fts_doc_list_t fts_doc_list; /*!< doc list to process */ + fts_psort_common_t* psort_common; /*!< ptr to all psort info */ + os_thread_t thread_hdl; /*!< thread handler */ + dberr_t error; /*!< db error during psort */ + ulint memory_used; /*!< memory used by fts_doc_list */ + ib_mutex_t mutex; /*!< mutex for fts_doc_list */ +}; + +/** Structure stores information from string tokenization operation */ +struct fts_tokenize_ctx { + ulint processed_len; /*!< processed string length */ + ulint init_pos; /*!< doc start position */ + ulint buf_used; /*!< the sort buffer (ID) when + tokenization stops, which + could due to sort buffer full */ + ulint rows_added[FTS_NUM_AUX_INDEX]; + /*!< number of rows added for + each FTS index partition */ + ib_rbt_t* cached_stopword;/*!< in: stopword list */ + dfield_t sort_field[FTS_NUM_FIELDS_SORT]; + /*!< in: sort field */ +}; + +typedef struct fts_tokenize_ctx fts_tokenize_ctx_t; + +/** Structure stores information needed for the insertion phase of FTS +parallel sort. */ +struct fts_psort_insert { + trx_t* trx; /*!< Transaction used for insertion */ + que_t** ins_graph; /*!< insert graph */ + fts_table_t fts_table; /*!< auxiliary table */ + CHARSET_INFO* charset; /*!< charset info */ + mem_heap_t* heap; /*!< heap */ + ibool opt_doc_id_size;/*!< Whether to use smaller (4 bytes) + integer for Doc ID */ +}; + +typedef struct fts_psort_insert fts_psort_insert_t; + + +/** status bit used for communication between parent and child thread */ +#define FTS_PARENT_COMPLETE 1 +#define FTS_PARENT_EXITING 2 +#define FTS_CHILD_COMPLETE 1 +#define FTS_CHILD_EXITING 2 + +/** Print some debug information */ +#define FTSORT_PRINT + +#ifdef FTSORT_PRINT +#define DEBUG_FTS_SORT_PRINT(str) \ + do { \ + ut_print_timestamp(stderr); \ + fprintf(stderr, str); \ + } while (0) +#else +#define DEBUG_FTS_SORT_PRINT(str) +#endif /* FTSORT_PRINT */ + +/*************************************************************//** +Create a temporary "fts sort index" used to merge sort the +tokenized doc string. The index has three "fields": + +1) Tokenized word, +2) Doc ID +3) Word's position in original 'doc'. + +@return dict_index_t structure for the fts sort index */ +UNIV_INTERN +dict_index_t* +row_merge_create_fts_sort_index( +/*============================*/ + dict_index_t* index, /*!< in: Original FTS index + based on which this sort index + is created */ + const dict_table_t* table, /*!< in: table that FTS index + is being created on */ + ibool* opt_doc_id_size); + /*!< out: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ + +/********************************************************************//** +Initialize FTS parallel sort structures. +@return TRUE if all successful */ +UNIV_INTERN +ibool +row_fts_psort_info_init( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + row_merge_dup_t* dup, /*!< in,own: descriptor of + FTS index being created */ + const dict_table_t* new_table,/*!< in: table where indexes are + created */ + ibool opt_doc_id_size, + /*!< in: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ + fts_psort_t** psort, /*!< out: parallel sort info to be + instantiated */ + fts_psort_t** merge) /*!< out: parallel merge info + to be instantiated */ + __attribute__((nonnull)); +/********************************************************************//** +Clean up and deallocate FTS parallel sort structures, and close +temparary merge sort files */ +UNIV_INTERN +void +row_fts_psort_info_destroy( +/*=======================*/ + fts_psort_t* psort_info, /*!< parallel sort info */ + fts_psort_t* merge_info); /*!< parallel merge info */ +/********************************************************************//** +Free up merge buffers when merge sort is done */ +UNIV_INTERN +void +row_fts_free_pll_merge_buf( +/*=======================*/ + fts_psort_t* psort_info); /*!< in: parallel sort info */ + +/*********************************************************************//** +Function performs parallel tokenization of the incoming doc strings. +@return OS_THREAD_DUMMY_RETURN */ +UNIV_INTERN +os_thread_ret_t +fts_parallel_tokenization( +/*======================*/ + void* arg); /*!< in: psort_info for the thread */ +/*********************************************************************//** +Start the parallel tokenization and parallel merge sort */ +UNIV_INTERN +void +row_fts_start_psort( +/*================*/ + fts_psort_t* psort_info); /*!< in: parallel sort info */ +/*********************************************************************//** +Function performs the merge and insertion of the sorted records. +@return OS_THREAD_DUMMY_RETURN */ +UNIV_INTERN +os_thread_ret_t +fts_parallel_merge( +/*===============*/ + void* arg); /*!< in: parallel merge info */ +/*********************************************************************//** +Kick off the parallel merge and insert thread */ +UNIV_INTERN +void +row_fts_start_parallel_merge( +/*=========================*/ + fts_psort_t* merge_info); /*!< in: parallel sort info */ +/********************************************************************//** +Read sorted FTS data files and insert data tuples to auxillary tables. +@return DB_SUCCESS or error number */ +UNIV_INTERN +void +row_fts_insert_tuple( +/*=================*/ + fts_psort_insert_t* + ins_ctx, /*!< in: insert context */ + fts_tokenizer_word_t* word, /*!< in: last processed + tokenized word */ + ib_vector_t* positions, /*!< in: word position */ + doc_id_t* in_doc_id, /*!< in: last item doc id */ + dtuple_t* dtuple); /*!< in: entry to insert */ +/********************************************************************//** +Propagate a newly added record up one level in the selection tree +@return parent where this value propagated to */ +UNIV_INTERN +int +row_merge_fts_sel_propagate( +/*========================*/ + int propogated, /*<! in: tree node propagated */ + int* sel_tree, /*<! in: selection tree */ + ulint level, /*<! in: selection tree level */ + const mrec_t** mrec, /*<! in: sort record */ + ulint** offsets, /*<! in: record offsets */ + dict_index_t* index); /*<! in: FTS index */ +/********************************************************************//** +Read sorted file containing index data tuples and insert these data +tuples to the index +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +row_fts_merge_insert( +/*=================*/ + dict_index_t* index, /*!< in: index */ + dict_table_t* table, /*!< in: new table */ + fts_psort_t* psort_info, /*!< parallel sort info */ + ulint id) /* !< in: which auxiliary table's data + to insert to */ + __attribute__((nonnull)); +#endif /* row0ftsort_h */ diff --git a/storage/innobase/include/row0import.h b/storage/innobase/include/row0import.h new file mode 100644 index 00000000000..aa46fdb7c27 --- /dev/null +++ b/storage/innobase/include/row0import.h @@ -0,0 +1,91 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0import.h +Header file for import tablespace functions. + +Created 2012-02-08 by Sunny Bains +*******************************************************/ + +#ifndef row0import_h +#define row0import_h + +#include "univ.i" +#include "db0err.h" +#include "dict0types.h" + +// Forward declarations +struct trx_t; +struct dict_table_t; +struct row_prebuilt_t; + +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_import_for_mysql( +/*=================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct + in MySQL */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Update the DICT_TF2_DISCARDED flag in SYS_TABLES. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +row_import_update_discarded_flag( +/*=============================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + table_id_t table_id, /*!< in: Table for which we want + to set the root table->flags2 */ + bool discarded, /*!< in: set MIX_LEN column bit + to discarded, if true */ + bool dict_locked) /*!< in: Set to true if the + caller already owns the + dict_sys_t:: mutex. */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Update the (space, root page) of a table's indexes from the values +in the data dictionary. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_import_update_index_root( +/*=========================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + const dict_table_t* table, /*!< in: Table for which we want + to set the root page_no */ + bool reset, /*!< in: if true then set to + FIL_NUL */ + bool dict_locked) /*!< in: Set to true if the + caller already owns the + dict_sys_t:: mutex. */ + __attribute__((nonnull, warn_unused_result)); +#ifndef UNIV_NONINL +#include "row0import.ic" +#endif + +#endif /* row0import_h */ diff --git a/storage/innobase/include/row0import.ic b/storage/innobase/include/row0import.ic new file mode 100644 index 00000000000..c5bbab49f6f --- /dev/null +++ b/storage/innobase/include/row0import.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0import.ic + +Import tablespace inline functions. + +Created 2012-02-08 Sunny Bains +*******************************************************/ diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h new file mode 100644 index 00000000000..2a892d2f5df --- /dev/null +++ b/storage/innobase/include/row0ins.h @@ -0,0 +1,240 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ins.h +Insert into a table + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0ins_h +#define row0ins_h + +#include "univ.i" +#include "data0data.h" +#include "que0types.h" +#include "dict0types.h" +#include "trx0types.h" +#include "row0types.h" + +/***************************************************************//** +Checks if foreign key constraint fails for an index entry. Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_foreign_key_check_lock. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or +DB_ROW_IS_REFERENCED */ +UNIV_INTERN +dberr_t +row_ins_check_foreign_constraint( +/*=============================*/ + ibool check_ref,/*!< in: TRUE If we want to check that + the referenced table is ok, FALSE if we + want to check the foreign key table */ + dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Creates an insert node struct. +@return own: insert node struct */ +UNIV_INTERN +ins_node_t* +ins_node_create( +/*============*/ + ulint ins_type, /*!< in: INS_VALUES, ... */ + dict_table_t* table, /*!< in: table where to insert */ + mem_heap_t* heap); /*!< in: mem heap where created */ +/*********************************************************************//** +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. */ +UNIV_INTERN +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /*!< in: insert node */ + dtuple_t* row); /*!< in: new row (or first row) for the node */ +/***************************************************************//** +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr) /*!< in: query thread or NULL */ + __attribute__((nonnull, warn_unused_result)); +/***************************************************************//** +Tries to insert an entry into a secondary index. If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/***************************************************************//** +Tries to insert the externally stored fields (off-page columns) +of a clustered index entry. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +UNIV_INTERN +dberr_t +row_ins_index_entry_big_rec_func( +/*=============================*/ + const dtuple_t* entry, /*!< in/out: index entry to insert */ + const big_rec_t* big_rec,/*!< in: externally stored fields */ + ulint* offsets,/*!< in/out: rec offsets */ + mem_heap_t** heap, /*!< in/out: memory heap */ + dict_index_t* index, /*!< in: index */ + const char* file, /*!< in: file name of caller */ +#ifndef DBUG_OFF + const void* thd, /*!< in: connection, or NULL */ +#endif /* DBUG_OFF */ + ulint line) /*!< in: line number of caller */ + __attribute__((nonnull(1,2,3,4,5,6), warn_unused_result)); +#ifdef DBUG_OFF +# define row_ins_index_entry_big_rec(e,big,ofs,heap,index,thd,file,line) \ + row_ins_index_entry_big_rec_func(e,big,ofs,heap,index,file,line) +#else /* DBUG_OFF */ +# define row_ins_index_entry_big_rec(e,big,ofs,heap,index,thd,file,line) \ + row_ins_index_entry_big_rec_func(e,big,ofs,heap,index,file,thd,line) +#endif /* DBUG_OFF */ +/***************************************************************//** +Inserts an entry into a clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ + __attribute__((nonnull, warn_unused_result)); +/***************************************************************//** +Inserts an entry into a secondary index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +dberr_t +row_ins_sec_index_entry( +/*====================*/ + dict_index_t* index, /*!< in: secondary index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************//** +Inserts a row to a table. This is a high-level function used in +SQL execution graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_ins_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ + +/* Insert node structure */ + +struct ins_node_t{ + que_common_t common; /*!< node type: QUE_NODE_INSERT */ + ulint ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */ + dtuple_t* row; /*!< row to insert */ + dict_table_t* table; /*!< table where to insert */ + sel_node_t* select; /*!< select in searched insert */ + que_node_t* values_list;/* list of expressions to evaluate and + insert in an INS_VALUES insert */ + ulint state; /*!< node execution state */ + dict_index_t* index; /*!< NULL, or the next index where the index + entry should be inserted */ + dtuple_t* entry; /*!< NULL, or entry to insert in the index; + after a successful insert of the entry, + this should be reset to NULL */ + UT_LIST_BASE_NODE_T(dtuple_t) + entry_list;/* list of entries, one for each index */ + byte* row_id_buf;/* buffer for the row id sys field in row */ + trx_id_t trx_id; /*!< trx id or the last trx which executed the + node */ + byte* trx_id_buf;/* buffer for the trx id sys field in row */ + mem_heap_t* entry_sys_heap; + /* memory heap used as auxiliary storage; + entry_list and sys fields are stored here; + if this is NULL, entry list should be created + and buffers for sys fields in row allocated */ + ulint magic_n; +}; + +#define INS_NODE_MAGIC_N 15849075 + +/* Insert node types */ +#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */ +#define INS_VALUES 1 /* INSERT INTO ... VALUES ... */ +#define INS_DIRECT 2 /* this is for internal use in dict0crea: + insert the row directly */ + +/* Node execution states */ +#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */ +#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */ +#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and + inserted */ + +#ifndef UNIV_NONINL +#include "row0ins.ic" +#endif + +#endif diff --git a/storage/innobase/include/row0ins.ic b/storage/innobase/include/row0ins.ic new file mode 100644 index 00000000000..9c191d869a2 --- /dev/null +++ b/storage/innobase/include/row0ins.ic @@ -0,0 +1,26 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ins.ic +Insert into a table + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + + diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h new file mode 100644 index 00000000000..62715fe8808 --- /dev/null +++ b/storage/innobase/include/row0log.h @@ -0,0 +1,239 @@ +/***************************************************************************** + +Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0log.h +Modification log for online index creation and online table rebuild + +Created 2011-05-26 Marko Makela +*******************************************************/ + +#ifndef row0log_h +#define row0log_h + +#include "univ.i" +#include "mtr0types.h" +#include "row0types.h" +#include "rem0types.h" +#include "data0types.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" + +/******************************************************//** +Allocate the row log for an index and flag the index +for online creation. +@retval true if success, false if not */ +UNIV_INTERN +bool +row_log_allocate( +/*=============*/ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table, /*!< in/out: new table being rebuilt, + or NULL when creating a secondary index */ + bool same_pk,/*!< in: whether the definition of the + PRIMARY KEY has remained the same */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map)/*!< in: mapping of old column + numbers to new ones, or NULL if !table */ + __attribute__((nonnull(1), warn_unused_result)); + +/******************************************************//** +Free the row log for an index that was being created online. */ +UNIV_INTERN +void +row_log_free( +/*=========*/ + row_log_t*& log) /*!< in,own: row log */ + __attribute__((nonnull)); + +/******************************************************//** +Free the row log for an index on which online creation was aborted. */ +UNIV_INLINE +void +row_log_abort_sec( +/*==============*/ + dict_index_t* index) /*!< in/out: index (x-latched) */ + __attribute__((nonnull)); + +/******************************************************//** +Try to log an operation to a secondary index that is +(or was) being created. +@retval true if the operation was logged or can be ignored +@retval false if online index creation is not taking place */ +UNIV_INLINE +bool +row_log_online_op_try( +/*==================*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************//** +Logs an operation to a secondary index that is (or was) being created. */ +UNIV_INTERN +void +row_log_online_op( +/*==============*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ + UNIV_COLD __attribute__((nonnull)); + +/******************************************************//** +Gets the error status of the online index rebuild log. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_log_table_get_error( +/*====================*/ + const dict_index_t* index) /*!< in: clustered index of a table + that is being rebuilt online */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************//** +Logs a delete operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_delete(). */ +UNIV_INTERN +void +row_log_table_delete( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should + be logged, or NULL to use those in rec */ + UNIV_COLD __attribute__((nonnull(1,2,3))); + +/******************************************************//** +Logs an update operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_update(). */ +UNIV_INTERN +void +row_log_table_update( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + const dtuple_t* old_pk) /*!< in: row_log_table_get_pk() + before the update */ + UNIV_COLD __attribute__((nonnull(1,2,3))); + +/******************************************************//** +Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR +of a table that is being rebuilt. +@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table, +or NULL if the PRIMARY KEY definition does not change */ +UNIV_INTERN +const dtuple_t* +row_log_table_get_pk( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index), + or NULL */ + byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for + row_log_table_delete(), or NULL */ + mem_heap_t** heap) /*!< in/out: memory heap where allocated */ + UNIV_COLD __attribute__((nonnull(1,2,5), warn_unused_result)); + +/******************************************************//** +Logs an insert to a table that is being rebuilt. +This will be merged in row_log_table_apply_insert(). */ +UNIV_INTERN +void +row_log_table_insert( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets)/*!< in: rec_get_offsets(rec,index) */ + UNIV_COLD __attribute__((nonnull)); +/******************************************************//** +Notes that a BLOB is being freed during online ALTER TABLE. */ +UNIV_INTERN +void +row_log_table_blob_free( +/*====================*/ + dict_index_t* index, /*!< in/out: clustered index, X-latched */ + ulint page_no)/*!< in: starting page number of the BLOB */ + UNIV_COLD __attribute__((nonnull)); +/******************************************************//** +Notes that a BLOB is being allocated during online ALTER TABLE. */ +UNIV_INTERN +void +row_log_table_blob_alloc( +/*=====================*/ + dict_index_t* index, /*!< in/out: clustered index, X-latched */ + ulint page_no)/*!< in: starting page number of the BLOB */ + UNIV_COLD __attribute__((nonnull)); +/******************************************************//** +Apply the row_log_table log to a table upon completing rebuild. +@return DB_SUCCESS, or error code on failure */ +UNIV_INTERN +dberr_t +row_log_table_apply( +/*================*/ + que_thr_t* thr, /*!< in: query graph */ + dict_table_t* old_table, + /*!< in: old table */ + struct TABLE* table) /*!< in/out: MySQL table + (for reporting duplicates) */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************//** +Get the latest transaction ID that has invoked row_log_online_op() +during online creation. +@return latest transaction ID, or 0 if nothing was logged */ +UNIV_INTERN +trx_id_t +row_log_get_max_trx( +/*================*/ + dict_index_t* index) /*!< in: index, must be locked */ + __attribute__((nonnull, warn_unused_result)); + +/******************************************************//** +Merge the row log to the index upon completing index creation. +@return DB_SUCCESS, or error code on failure */ +UNIV_INTERN +dberr_t +row_log_apply( +/*==========*/ + trx_t* trx, /*!< in: transaction (for checking if + the operation was interrupted) */ + dict_index_t* index, /*!< in/out: secondary index */ + struct TABLE* table) /*!< in/out: MySQL table + (for reporting duplicates) */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_NONINL +#include "row0log.ic" +#endif + +#endif /* row0log.h */ diff --git a/storage/innobase/include/row0log.ic b/storage/innobase/include/row0log.ic new file mode 100644 index 00000000000..b0f37dbd8e7 --- /dev/null +++ b/storage/innobase/include/row0log.ic @@ -0,0 +1,84 @@ +/***************************************************************************** + +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0log.ic +Modification log for online index creation and online table rebuild + +Created 2012-10-18 Marko Makela +*******************************************************/ + +#include "dict0dict.h" + +/******************************************************//** +Free the row log for an index on which online creation was aborted. */ +UNIV_INLINE +void +row_log_abort_sec( +/*===============*/ + dict_index_t* index) /*!< in/out: index (x-latched) */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(!dict_index_is_clust(index)); + dict_index_set_online_status(index, ONLINE_INDEX_ABORTED); + row_log_free(index->online_log); +} + +/******************************************************//** +Try to log an operation to a secondary index that is +(or was) being created. +@retval true if the operation was logged or can be ignored +@retval false if online index creation is not taking place */ +UNIV_INLINE +bool +row_log_online_op_try( +/*==================*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED) + || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_COMPLETE: + /* This is a normal index. Do not log anything. + The caller must perform the operation on the + index tree directly. */ + return(false); + case ONLINE_INDEX_CREATION: + /* The index is being created online. Log the + operation. */ + row_log_online_op(index, tuple, trx_id); + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + /* The index was created online, but the operation was + aborted. Do not log the operation and tell the caller + to skip the operation. */ + break; + } + + return(true); +} diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h new file mode 100644 index 00000000000..2b9e9f7711c --- /dev/null +++ b/storage/innobase/include/row0merge.h @@ -0,0 +1,430 @@ +/***************************************************************************** + +Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0merge.h +Index build routines using a merge sort + +Created 13/06/2005 Jan Lindstrom +*******************************************************/ + +#ifndef row0merge_h +#define row0merge_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "rem0rec.h" +#include "read0types.h" +#include "btr0types.h" +#include "row0mysql.h" +#include "lock0types.h" +#include "srv0srv.h" + +// Forward declaration +struct ib_sequence_t; + +/** @brief Block size for I/O operations in merge sort. + +The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty() +rounded to a power of 2. + +When not creating a PRIMARY KEY that contains column prefixes, this +can be set as small as UNIV_PAGE_SIZE / 2. */ +typedef byte row_merge_block_t; + +/** @brief Secondary buffer for I/O operations of merge records. + +This buffer is used for writing or reading a record that spans two +row_merge_block_t. Thus, it must be able to hold one merge record, +whose maximum size is the same as the minimum size of +row_merge_block_t. */ +typedef byte mrec_buf_t[UNIV_PAGE_SIZE_MAX]; + +/** @brief Merge record in row_merge_block_t. + +The format is the same as a record in ROW_FORMAT=COMPACT with the +exception that the REC_N_NEW_EXTRA_BYTES are omitted. */ +typedef byte mrec_t; + +/** Merge record in row_merge_buf_t */ +struct mtuple_t { + dfield_t* fields; /*!< data fields */ +}; + +/** Buffer for sorting in main memory. */ +struct row_merge_buf_t { + mem_heap_t* heap; /*!< memory heap where allocated */ + dict_index_t* index; /*!< the index the tuples belong to */ + ulint total_size; /*!< total amount of data bytes */ + ulint n_tuples; /*!< number of data tuples */ + ulint max_tuples; /*!< maximum number of data tuples */ + mtuple_t* tuples; /*!< array of data tuples */ + mtuple_t* tmp_tuples; /*!< temporary copy of tuples, + for sorting */ +}; + +/** Information about temporary files used in merge sort */ +struct merge_file_t { + int fd; /*!< file descriptor */ + ulint offset; /*!< file offset (end of file) */ + ib_uint64_t n_rec; /*!< number of records in the file */ +}; + +/** Index field definition */ +struct index_field_t { + ulint col_no; /*!< column offset */ + ulint prefix_len; /*!< column prefix length, or 0 + if indexing the whole column */ +}; + +/** Definition of an index being created */ +struct index_def_t { + const char* name; /*!< index name */ + ulint ind_type; /*!< 0, DICT_UNIQUE, + or DICT_CLUSTERED */ + ulint key_number; /*!< MySQL key number, + or ULINT_UNDEFINED if none */ + ulint n_fields; /*!< number of fields in index */ + index_field_t* fields; /*!< field definitions */ +}; + +/** Structure for reporting duplicate records. */ +struct row_merge_dup_t { + dict_index_t* index; /*!< index being sorted */ + struct TABLE* table; /*!< MySQL table object */ + const ulint* col_map;/*!< mapping of column numbers + in table to the rebuilt table + (index->table), or NULL if not + rebuilding table */ + ulint n_dup; /*!< number of duplicates */ +}; + +/*************************************************************//** +Report a duplicate key. */ +UNIV_INTERN +void +row_merge_dup_report( +/*=================*/ + row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ + const dfield_t* entry) /*!< in: duplicate index entry */ + __attribute__((nonnull)); +/*********************************************************************//** +Sets an exclusive lock on a table, for the duration of creating indexes. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_merge_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +UNIV_INTERN +void +row_merge_drop_indexes_dict( +/*========================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + table_id_t table_id)/*!< in: table identifier */ + __attribute__((nonnull)); +/*********************************************************************//** +Drop those indexes which were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +UNIV_INTERN +void +row_merge_drop_indexes( +/*===================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in/out: table containing the indexes */ + ibool locked) /*!< in: TRUE=table locked, + FALSE=may need to do a lazy drop */ + __attribute__((nonnull)); +/*********************************************************************//** +Drop all partially created indexes during crash recovery. */ +UNIV_INTERN +void +row_merge_drop_temp_indexes(void); +/*=============================*/ + +/*********************************************************************//** +Creates temporary merge files, and if UNIV_PFS_IO defined, register +the file descriptor with Performance Schema. +@return File descriptor */ +UNIV_INTERN +int +row_merge_file_create_low(void) +/*===========================*/ + __attribute__((warn_unused_result)); +/*********************************************************************//** +Destroy a merge file. And de-register the file from Performance Schema +if UNIV_PFS_IO is defined. */ +UNIV_INTERN +void +row_merge_file_destroy_low( +/*=======================*/ + int fd); /*!< in: merge file descriptor */ + +/*********************************************************************//** +Provide a new pathname for a table that is being renamed if it belongs to +a file-per-table tablespace. The caller is responsible for freeing the +memory allocated for the return value. +@return new pathname of tablespace file, or NULL if space = 0 */ +UNIV_INTERN +char* +row_make_new_pathname( +/*==================*/ + dict_table_t* table, /*!< in: table to be renamed */ + const char* new_name); /*!< in: new name */ +/*********************************************************************//** +Rename the tables in the data dictionary. The data dictionary must +have been locked exclusively by the caller, because the transaction +will not be committed. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_merge_rename_tables_dict( +/*=========================*/ + dict_table_t* old_table, /*!< in/out: old table, renamed to + tmp_name */ + dict_table_t* new_table, /*!< in/out: new table, renamed to + old_table->name */ + const char* tmp_name, /*!< in: new name for old_table */ + trx_t* trx) /*!< in/out: dictionary transaction */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Rename an index in the dictionary that was created. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +row_merge_rename_index_to_add( +/*==========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ + __attribute__((nonnull)); +/*********************************************************************//** +Rename an index in the dictionary that is to be dropped. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +row_merge_rename_index_to_drop( +/*===========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ + __attribute__((nonnull)); +/*********************************************************************//** +Create the index and load in to the dictionary. +@return index, or NULL on error */ +UNIV_INTERN +dict_index_t* +row_merge_create_index( +/*===================*/ + trx_t* trx, /*!< in/out: trx (sets error_state) */ + dict_table_t* table, /*!< in: the index is on this table */ + const index_def_t* index_def); + /*!< in: the index definition */ +/*********************************************************************//** +Check if a transaction can use an index. +@return TRUE if index can be used by the transaction else FALSE */ +UNIV_INTERN +ibool +row_merge_is_index_usable( +/*======================*/ + const trx_t* trx, /*!< in: transaction */ + const dict_index_t* index); /*!< in: index to check */ +/*********************************************************************//** +Drop a table. The caller must have ensured that the background stats +thread is not processing the table. This can be done by calling +dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and +before calling this function. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_merge_drop_table( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table instance to drop */ + __attribute__((nonnull)); +/*********************************************************************//** +Build indexes on a table by reading a clustered index, +creating a temporary file containing index entries, merge sorting +these index entries and inserting sorted index entries to indexes. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_merge_build_indexes( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + dict_table_t* old_table, /*!< in: table where rows are + read from */ + dict_table_t* new_table, /*!< in: table where indexes are + created; identical to old_table + unless creating a PRIMARY KEY */ + bool online, /*!< in: true if creating indexes + online */ + dict_index_t** indexes, /*!< in: indexes to be created */ + const ulint* key_numbers, /*!< in: MySQL key numbers */ + ulint n_indexes, /*!< in: size of indexes[] */ + struct TABLE* table, /*!< in/out: MySQL table, for + reporting erroneous key value + if applicable */ + const dtuple_t* add_cols, /*!< in: default values of + added columns, or NULL */ + const ulint* col_map, /*!< in: mapping of old column + numbers to new ones, or NULL + if old_table == new_table */ + ulint add_autoinc, /*!< in: number of added + AUTO_INCREMENT column, or + ULINT_UNDEFINED if none is added */ + ib_sequence_t& sequence) /*!< in/out: autoinc sequence */ + __attribute__((nonnull(1,2,3,5,6,8), warn_unused_result)); +/********************************************************************//** +Write a buffer to a block. */ +UNIV_INTERN +void +row_merge_buf_write( +/*================*/ + const row_merge_buf_t* buf, /*!< in: sorted buffer */ + const merge_file_t* of, /*!< in: output file */ + row_merge_block_t* block) /*!< out: buffer for writing to file */ + __attribute__((nonnull)); +/********************************************************************//** +Sort a buffer. */ +UNIV_INTERN +void +row_merge_buf_sort( +/*===============*/ + row_merge_buf_t* buf, /*!< in/out: sort buffer */ + row_merge_dup_t* dup) /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ + __attribute__((nonnull(1))); +/********************************************************************//** +Write a merge block to the file system. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +row_merge_write( +/*============*/ + int fd, /*!< in: file descriptor */ + ulint offset, /*!< in: offset where to write, + in number of row_merge_block_t elements */ + const void* buf); /*!< in: data */ +/********************************************************************//** +Empty a sort buffer. +@return sort buffer */ +UNIV_INTERN +row_merge_buf_t* +row_merge_buf_empty( +/*================*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer */ + __attribute__((warn_unused_result, nonnull)); +/*********************************************************************//** +Create a merge file. +@return file descriptor, or -1 on failure */ +UNIV_INTERN +int +row_merge_file_create( +/*==================*/ + merge_file_t* merge_file) /*!< out: merge file structure */ + __attribute__((nonnull)); +/*********************************************************************//** +Merge disk files. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_merge_sort( +/*===========*/ + trx_t* trx, /*!< in: transaction */ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ + merge_file_t* file, /*!< in/out: file containing + index entries */ + row_merge_block_t* block, /*!< in/out: 3 buffers */ + int* tmpfd) /*!< in/out: temporary file handle */ + __attribute__((nonnull)); +/*********************************************************************//** +Allocate a sort buffer. +@return own: sort buffer */ +UNIV_INTERN +row_merge_buf_t* +row_merge_buf_create( +/*=================*/ + dict_index_t* index) /*!< in: secondary index */ + __attribute__((warn_unused_result, nonnull, malloc)); +/*********************************************************************//** +Deallocate a sort buffer. */ +UNIV_INTERN +void +row_merge_buf_free( +/*===============*/ + row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */ + __attribute__((nonnull)); +/*********************************************************************//** +Destroy a merge file. */ +UNIV_INTERN +void +row_merge_file_destroy( +/*===================*/ + merge_file_t* merge_file) /*!< in/out: merge file structure */ + __attribute__((nonnull)); +/********************************************************************//** +Read a merge block from the file system. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +row_merge_read( +/*===========*/ + int fd, /*!< in: file descriptor */ + ulint offset, /*!< in: offset where to read + in number of row_merge_block_t + elements */ + row_merge_block_t* buf); /*!< out: data */ +/********************************************************************//** +Read a merge record. +@return pointer to next record, or NULL on I/O error or end of list */ +UNIV_INTERN +const byte* +row_merge_read_rec( +/*===============*/ + row_merge_block_t* block, /*!< in/out: file buffer */ + mrec_buf_t* buf, /*!< in/out: secondary buffer */ + const byte* b, /*!< in: pointer to record */ + const dict_index_t* index, /*!< in: index of the record */ + int fd, /*!< in: file descriptor */ + ulint* foffs, /*!< in/out: file offset */ + const mrec_t** mrec, /*!< out: pointer to merge record, + or NULL on end of list + (non-NULL on I/O error) */ + ulint* offsets)/*!< out: offsets of mrec */ + __attribute__((nonnull, warn_unused_result)); +#endif /* row0merge.h */ diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h new file mode 100644 index 00000000000..06c07002c2b --- /dev/null +++ b/storage/innobase/include/row0mysql.h @@ -0,0 +1,915 @@ +/***************************************************************************** + +Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0mysql.h +Interface between Innobase row operations and MySQL. +Contains also create table and other data dictionary operations. + +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#ifndef row0mysql_h +#define row0mysql_h + +#include "univ.i" +#include "data0data.h" +#include "que0types.h" +#include "dict0types.h" +#include "trx0types.h" +#include "row0types.h" +#include "btr0pcur.h" +#include "trx0types.h" + +// Forward declaration +struct SysIndexCallback; + +extern ibool row_rollback_on_timeout; + +struct row_prebuilt_t; + +/*******************************************************************//** +Frees the blob heap in prebuilt when no longer needed. */ +UNIV_INTERN +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct of a + ha_innobase:: table handle */ +/*******************************************************************//** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +UNIV_INTERN +byte* +row_mysql_store_true_var_len( +/*=========================*/ + byte* dest, /*!< in: where to store */ + ulint len, /*!< in: length, must fit in two bytes */ + ulint lenlen);/*!< in: storage length of len: either 1 or 2 bytes */ +/*******************************************************************//** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +UNIV_INTERN +const byte* +row_mysql_read_true_varchar( +/*========================*/ + ulint* len, /*!< out: variable-length field length */ + const byte* field, /*!< in: field in the MySQL format */ + ulint lenlen);/*!< in: storage length of len: either 1 + or 2 bytes */ +/*******************************************************************//** +Stores a reference to a BLOB in the MySQL format. */ +UNIV_INTERN +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /*!< in: where to store */ + ulint col_len,/*!< in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const void* data, /*!< in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len); /*!< in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +/*******************************************************************//** +Reads a reference to a BLOB in the MySQL format. +@return pointer to BLOB data */ +UNIV_INTERN +const byte* +row_mysql_read_blob_ref( +/*====================*/ + ulint* len, /*!< out: BLOB length */ + const byte* ref, /*!< in: BLOB reference in the + MySQL format */ + ulint col_len); /*!< in: BLOB reference length + (not BLOB length) */ +/**************************************************************//** +Pad a column with spaces. */ +UNIV_INTERN +void +row_mysql_pad_col( +/*==============*/ + ulint mbminlen, /*!< in: minimum size of a character, + in bytes */ + byte* pad, /*!< out: padded buffer */ + ulint len); /*!< in: number of bytes to pad */ + +/**************************************************************//** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.cc. +@return up to which byte we used buf in the conversion */ +UNIV_INTERN +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + dfield_t* dfield, /*!< in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /*!< in/out: buffer for a converted + integer value; this must be at least + col_len long then! NOTE that dfield + may also get a pointer to 'buf', + therefore do not discard this as long + as dfield is used! */ + ibool row_format_col, /*!< TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /*!< in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /*!< in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp); /*!< in: nonzero=compact format */ +/****************************************************************//** +Handles user errors and lock waits detected by the database engine. +@return true if it was a lock wait and we should continue running the +query thread */ +UNIV_INTERN +bool +row_mysql_handle_errors( +/*====================*/ + dberr_t* new_err,/*!< out: possible new error encountered in + rollback, or the old error which was + during the function entry */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ + __attribute__((nonnull(1,2))); +/********************************************************************//** +Create a prebuilt struct for a MySQL table handle. +@return own: a prebuilt struct */ +UNIV_INTERN +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + dict_table_t* table, /*!< in: Innobase table handle */ + ulint mysql_row_len); /*!< in: length in bytes of a row in + the MySQL format */ +/********************************************************************//** +Free a prebuilt struct for a MySQL table handle. */ +UNIV_INTERN +void +row_prebuilt_free( +/*==============*/ + row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */ + ibool dict_locked); /*!< in: TRUE=data dictionary locked */ +/*********************************************************************//** +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +UNIV_INTERN +void +row_update_prebuilt_trx( +/*====================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + in MySQL handle */ + trx_t* trx); /*!< in: transaction handle */ +/*********************************************************************//** +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_lock_table_autoinc_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL + table handle */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Sets a table lock on the table mentioned in prebuilt. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_lock_table_for_mysql( +/*=====================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL + table handle */ + dict_table_t* table, /*!< in: table to lock, or NULL + if prebuilt->table should be + locked as + prebuilt->select_lock_type */ + ulint mode) /*!< in: lock mode of table + (ignored if table==NULL) */ + __attribute__((nonnull(1))); +/*********************************************************************//** +Does an insert for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_insert_for_mysql( +/*=================*/ + byte* mysql_rec, /*!< in: row in the MySQL format */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Builds a dummy query graph used in selects. */ +UNIV_INTERN +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + handle */ +/*********************************************************************//** +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. +@return prebuilt update vector */ +UNIV_INTERN +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + handle */ +/*********************************************************************//** +Checks if a table is such that we automatically created a clustered +index on it (on row id). +@return TRUE if the clustered index was generated automatically */ +UNIV_INTERN +ibool +row_table_got_default_clust_index( +/*==============================*/ + const dict_table_t* table); /*!< in: table */ +/*********************************************************************//** +Does an update or delete of a row for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_update_for_mysql( +/*=================*/ + byte* mysql_rec, /*!< in: the row to be updated, in + the MySQL format */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL + handle */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +This can only be used when srv_locks_unsafe_for_binlog is TRUE or this +session is using a READ COMMITTED or READ UNCOMMITTED isolation level. +Before calling this function row_search_for_mysql() must have +initialized prebuilt->new_rec_locks to store the information which new +record locks really were set. This function removes a newly set +clustered index record lock under prebuilt->pcur or +prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that +releases the latest clustered index record lock we set. */ +UNIV_INTERN +void +row_unlock_for_mysql( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct in MySQL + handle */ + ibool has_latches_on_recs)/*!< in: TRUE if called + so that we have the latches on + the records under pcur and + clust_pcur, and we do not need + to reposition the cursors. */ + __attribute__((nonnull)); +/*********************************************************************//** +Checks if a table name contains the string "/#sql" which denotes temporary +tables in MySQL. +@return true if temporary table */ +UNIV_INTERN +bool +row_is_mysql_tmp_table_name( +/*========================*/ + const char* name) __attribute__((warn_unused_result)); + /*!< in: table name in the form + 'database/tablename' */ + +/*********************************************************************//** +Creates an query graph node of 'update' type to be used in the MySQL +interface. +@return own: update node */ +UNIV_INTERN +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + dict_table_t* table, /*!< in: table to update */ + mem_heap_t* heap); /*!< in: mem heap from which allocated */ +/**********************************************************************//** +Does a cascaded delete or set null in a foreign key operation. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_update_cascade_for_mysql( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread */ + upd_node_t* node, /*!< in: update node used in the cascade + or set null operation */ + dict_table_t* table) /*!< in: table where we do the operation */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Locks the data dictionary exclusively for performing a table create or other +data dictionary modification operation. */ +UNIV_INTERN +void +row_mysql_lock_data_dictionary_func( +/*================================*/ + trx_t* trx, /*!< in/out: transaction */ + const char* file, /*!< in: file name */ + ulint line); /*!< in: line number */ +#define row_mysql_lock_data_dictionary(trx) \ + row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__) +/*********************************************************************//** +Unlocks the data dictionary exclusive lock. */ +UNIV_INTERN +void +row_mysql_unlock_data_dictionary( +/*=============================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Locks the data dictionary in shared mode from modifications, for performing +foreign key check, rollback, or other operation invisible to MySQL. */ +UNIV_INTERN +void +row_mysql_freeze_data_dictionary_func( +/*==================================*/ + trx_t* trx, /*!< in/out: transaction */ + const char* file, /*!< in: file name */ + ulint line); /*!< in: line number */ +#define row_mysql_freeze_data_dictionary(trx) \ + row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__) +/*********************************************************************//** +Unlocks the data dictionary shared lock. */ +UNIV_INTERN +void +row_mysql_unfreeze_data_dictionary( +/*===============================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Creates a table for MySQL. If the name of the table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also start the printing of monitor +output by the master thread. If the table name ends in "innodb_mem_validate", +InnoDB will try to invoke mem_validate(). On failure the transaction will +be rolled back. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_create_table_for_mysql( +/*=======================*/ + dict_table_t* table, /*!< in, own: table definition + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: if true, commit the transaction */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Does an index creation operation for MySQL. TODO: currently failure +to create an index results in dropping the whole table! This is no problem +currently as all indexes must be created at the same time as the table. +@return error number or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_create_index_for_mysql( +/*=======================*/ + dict_index_t* index, /*!< in, own: index definition + (will be freed) */ + trx_t* trx, /*!< in: transaction handle */ + const ulint* field_lengths) /*!< in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. */ + __attribute__((nonnull(1,2), warn_unused_result)); +/*********************************************************************//** +Scans a table create SQL string and adds to the data dictionary +the foreign key constraints declared in the string. This function +should be called after the indexes for a table have been created. +Each foreign key constraint must be accompanied with indexes in +bot participating tables. The indexes are allowed to contain more +fields than mentioned in the constraint. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_table_add_foreign_constraints( +/*==============================*/ + trx_t* trx, /*!< in: transaction */ + const char* sql_string, /*!< in: table create statement where + foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES table2(c, d), + table2 can be written also with the + database name before it: test.table2 */ + size_t sql_length, /*!< in: length of sql_string */ + const char* name, /*!< in: table full name in the + normalized form + database_name/table_name */ + ibool reject_fks) /*!< in: if TRUE, fail with error + code DB_CANNOT_ADD_CONSTRAINT if + any foreign keys are found. */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +The master thread in srv0srv.cc calls this regularly to drop tables which +we must drop in background after queries to them have ended. Such lazy +dropping of tables is needed in ALTER TABLE on Unix. +@return how many tables dropped + remaining tables in list */ +UNIV_INTERN +ulint +row_drop_tables_for_mysql_in_background(void); +/*=========================================*/ +/*********************************************************************//** +Get the background drop list length. NOTE: the caller must own the kernel +mutex! +@return how many tables in list */ +UNIV_INTERN +ulint +row_get_background_drop_list_len_low(void); +/*======================================*/ +/*********************************************************************//** +Sets an exclusive lock on a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_mysql_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */ + const char* op_info) /*!< in: string for trx->op_info */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Truncates a table for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_truncate_table_for_mysql( +/*=========================*/ + dict_table_t* table, /*!< in: table handle */ + trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Drops a table for MySQL. If the name of the dropped table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also stop the printing of monitor +output by the master thread. If the data dictionary was not already locked +by the transaction, the transaction will be committed. Otherwise, the +data dictionary will remain locked. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_drop_table_for_mysql( +/*=====================*/ + const char* name, /*!< in: table name */ + trx_t* trx, /*!< in: dictionary transaction handle */ + bool drop_db,/*!< in: true=dropping whole database */ + bool nonatomic = true) + /*!< in: whether it is permitted + to release and reacquire dict_operation_lock */ + __attribute__((nonnull)); +/*********************************************************************//** +Drop all temporary tables during crash recovery. */ +UNIV_INTERN +void +row_mysql_drop_temp_tables(void); +/*============================*/ + +/*********************************************************************//** +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function deletes the .ibd file and assigns a new table id for +the table. Also the flag table->ibd_file_missing is set TRUE. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_discard_tablespace_for_mysql( +/*=============================*/ + const char* name, /*!< in: table name */ + trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull, warn_unused_result)); +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_import_tablespace_for_mysql( +/*============================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Drops a database for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_drop_database_for_mysql( +/*========================*/ + const char* name, /*!< in: database name which ends to '/' */ + trx_t* trx) /*!< in: transaction handle */ + __attribute__((nonnull)); +/*********************************************************************//** +Renames a table for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_rename_table_for_mysql( +/*=======================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: whether to commit trx */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Checks that the index contains entries in an ascending order, unique +constraint is not broken, and calculates the number of index entries +in the read view of the current transaction. +@return true if ok */ +UNIV_INTERN +bool +row_check_index_for_mysql( +/*======================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct + in MySQL handle */ + const dict_index_t* index, /*!< in: index */ + ulint* n_rows) /*!< out: number of entries + seen in the consistent read */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Determines if a table is a magic monitor table. +@return true if monitor table */ +UNIV_INTERN +bool +row_is_magic_monitor_table( +/*=======================*/ + const char* table_name) /*!< in: name of the table, in the + form database/table_name */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Initialize this module */ +UNIV_INTERN +void +row_mysql_init(void); +/*================*/ + +/*********************************************************************//** +Close this module */ +UNIV_INTERN +void +row_mysql_close(void); +/*=================*/ + +/*********************************************************************//** +Reassigns the table identifier of a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_mysql_table_id_reassign( +/*========================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx, /*!< in/out: transaction */ + table_id_t* new_id) /*!< out: new table id */ + __attribute__((nonnull, warn_unused_result)); + +/* A struct describing a place for an individual column in the MySQL +row format which is presented to the table handler in ha_innobase. +This template struct is used to speed up row transformations between +Innobase and MySQL. */ + +struct mysql_row_templ_t { + ulint col_no; /*!< column number of the column */ + ulint rec_field_no; /*!< field number of the column in an + Innobase record in the current index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ulint clust_rec_field_no; /*!< field number of the column in an + Innobase record in the clustered index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ulint icp_rec_field_no; /*!< field number of the column in an + Innobase record in the current index; + not defined unless + index condition pushdown is used */ + ulint mysql_col_offset; /*!< offset of the column in the MySQL + row format */ + ulint mysql_col_len; /*!< length of the column in the MySQL + row format */ + ulint mysql_null_byte_offset; /*!< MySQL NULL bit byte offset in a + MySQL record */ + ulint mysql_null_bit_mask; /*!< bit mask to get the NULL bit, + zero if column cannot be NULL */ + ulint type; /*!< column type in Innobase mtype + numbers DATA_CHAR... */ + ulint mysql_type; /*!< MySQL type code; this is always + < 256 */ + ulint mysql_length_bytes; /*!< if mysql_type + == DATA_MYSQL_TRUE_VARCHAR, this tells + whether we should use 1 or 2 bytes to + store the MySQL true VARCHAR data + length at the start of row in the MySQL + format (NOTE that the MySQL key value + format always uses 2 bytes for the data + len) */ + ulint charset; /*!< MySQL charset-collation code + of the column, or zero */ + ulint mbminlen; /*!< minimum length of a char, in bytes, + or zero if not a char type */ + ulint mbmaxlen; /*!< maximum length of a char, in bytes, + or zero if not a char type */ + ulint is_unsigned; /*!< if a column type is an integer + type and this field is != 0, then + it is an unsigned integer type */ +}; + +#define MYSQL_FETCH_CACHE_SIZE 8 +/* After fetching this many rows, we start caching them in fetch_cache */ +#define MYSQL_FETCH_CACHE_THRESHOLD 4 + +#define ROW_PREBUILT_ALLOCATED 78540783 +#define ROW_PREBUILT_FREED 26423527 + +/** A struct for (sometimes lazily) prebuilt structures in an Innobase table +handle used within MySQL; these are used to save CPU time. */ + +struct row_prebuilt_t { + ulint magic_n; /*!< this magic number is set to + ROW_PREBUILT_ALLOCATED when created, + or ROW_PREBUILT_FREED when the + struct has been freed */ + dict_table_t* table; /*!< Innobase table handle */ + dict_index_t* index; /*!< current index for a search, if + any */ + trx_t* trx; /*!< current transaction handle */ + unsigned sql_stat_start:1;/*!< TRUE when we start processing of + an SQL statement: we may have to set + an intention lock on the table, + create a consistent read view etc. */ + unsigned mysql_has_locked:1;/*!< this is set TRUE when MySQL + calls external_lock on this handle + with a lock flag, and set FALSE when + with the F_UNLOCK flag */ + unsigned clust_index_was_generated:1; + /*!< if the user did not define a + primary key in MySQL, then Innobase + automatically generated a clustered + index where the ordering column is + the row id: in this case this flag + is set to TRUE */ + unsigned index_usable:1; /*!< caches the value of + row_merge_is_index_usable(trx,index) */ + unsigned read_just_key:1;/*!< set to 1 when MySQL calls + ha_innobase::extra with the + argument HA_EXTRA_KEYREAD; it is enough + to read just columns defined in + the index (i.e., no read of the + clustered index record necessary) */ + unsigned used_in_HANDLER:1;/*!< TRUE if we have been using this + handle in a MySQL HANDLER low level + index cursor command: then we must + store the pcur position even in a + unique search from a clustered index, + because HANDLER allows NEXT and PREV + in such a situation */ + unsigned template_type:2;/*!< ROW_MYSQL_WHOLE_ROW, + ROW_MYSQL_REC_FIELDS, + ROW_MYSQL_DUMMY_TEMPLATE, or + ROW_MYSQL_NO_TEMPLATE */ + unsigned n_template:10; /*!< number of elements in the + template */ + unsigned null_bitmap_len:10;/*!< number of bytes in the SQL NULL + bitmap at the start of a row in the + MySQL format */ + unsigned need_to_access_clustered:1; /*!< if we are fetching + columns through a secondary index + and at least one column is not in + the secondary index, then this is + set to TRUE */ + unsigned templ_contains_blob:1;/*!< TRUE if the template contains + a column with DATA_BLOB == + get_innobase_type_from_mysql_type(); + not to be confused with InnoDB + externally stored columns + (VARCHAR can be off-page too) */ + mysql_row_templ_t* mysql_template;/*!< template used to transform + rows fast between MySQL and Innobase + formats; memory for this template + is not allocated from 'heap' */ + mem_heap_t* heap; /*!< memory heap from which + these auxiliary structures are + allocated when needed */ + ins_node_t* ins_node; /*!< Innobase SQL insert node + used to perform inserts + to the table */ + byte* ins_upd_rec_buff;/*!< buffer for storing data converted + to the Innobase format from the MySQL + format */ + const byte* default_rec; /*!< the default values of all columns + (a "default row") in MySQL format */ + ulint hint_need_to_fetch_extra_cols; + /*!< normally this is set to 0; if this + is set to ROW_RETRIEVE_PRIMARY_KEY, + then we should at least retrieve all + columns in the primary key; if this + is set to ROW_RETRIEVE_ALL_COLS, then + we must retrieve all columns in the + key (if read_just_key == 1), or all + columns in the table */ + upd_node_t* upd_node; /*!< Innobase SQL update node used + to perform updates and deletes */ + trx_id_t trx_id; /*!< The table->def_trx_id when + ins_graph was built */ + que_fork_t* ins_graph; /*!< Innobase SQL query graph used + in inserts. Will be rebuilt on + trx_id or n_indexes mismatch. */ + que_fork_t* upd_graph; /*!< Innobase SQL query graph used + in updates or deletes */ + btr_pcur_t pcur; /*!< persistent cursor used in selects + and updates */ + btr_pcur_t clust_pcur; /*!< persistent cursor used in + some selects and updates */ + que_fork_t* sel_graph; /*!< dummy query graph used in + selects */ + dtuple_t* search_tuple; /*!< prebuilt dtuple used in selects */ + byte row_id[DATA_ROW_ID_LEN]; + /*!< if the clustered index was + generated, the row id of the + last row fetched is stored + here */ + doc_id_t fts_doc_id; /* if the table has an FTS index on + it then we fetch the doc_id. + FTS-FIXME: Currently we fetch it always + but in the future we must only fetch + it when FTS columns are being + updated */ + dtuple_t* clust_ref; /*!< prebuilt dtuple used in + sel/upd/del */ + ulint select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */ + ulint stored_select_lock_type;/*!< this field is used to + remember the original select_lock_type + that was decided in ha_innodb.cc, + ::store_lock(), ::external_lock(), + etc. */ + ulint row_read_type; /*!< ROW_READ_WITH_LOCKS if row locks + should be the obtained for records + under an UPDATE or DELETE cursor. + If innodb_locks_unsafe_for_binlog + is TRUE, this can be set to + ROW_READ_TRY_SEMI_CONSISTENT, so that + if the row under an UPDATE or DELETE + cursor was locked by another + transaction, InnoDB will resort + to reading the last committed value + ('semi-consistent read'). Then, + this field will be set to + ROW_READ_DID_SEMI_CONSISTENT to + indicate that. If the row does not + match the WHERE condition, MySQL will + invoke handler::unlock_row() to + clear the flag back to + ROW_READ_TRY_SEMI_CONSISTENT and + to simply skip the row. If + the row matches, the next call to + row_search_for_mysql() will lock + the row. + This eliminates lock waits in some + cases; note that this breaks + serializability. */ + ulint new_rec_locks; /*!< normally 0; if + srv_locks_unsafe_for_binlog is + TRUE or session is using READ + COMMITTED or READ UNCOMMITTED + isolation level, set in + row_search_for_mysql() if we set a new + record lock on the secondary + or clustered index; this is + used in row_unlock_for_mysql() + when releasing the lock under + the cursor if we determine + after retrieving the row that + it does not need to be locked + ('mini-rollback') */ + ulint mysql_prefix_len;/*!< byte offset of the end of + the last requested column */ + ulint mysql_row_len; /*!< length in bytes of a row in the + MySQL format */ + ulint n_rows_fetched; /*!< number of rows fetched after + positioning the current cursor */ + ulint fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */ + byte* fetch_cache[MYSQL_FETCH_CACHE_SIZE]; + /*!< a cache for fetched rows if we + fetch many rows from the same cursor: + it saves CPU time to fetch them in a + batch; we reserve mysql_row_len + bytes for each such row; these + pointers point 4 bytes past the + allocated mem buf start, because + there is a 4 byte magic number at the + start and at the end */ + ibool keep_other_fields_on_keyread; /*!< when using fetch + cache with HA_EXTRA_KEYREAD, don't + overwrite other fields in mysql row + row buffer.*/ + ulint fetch_cache_first;/*!< position of the first not yet + fetched row in fetch_cache */ + ulint n_fetch_cached; /*!< number of not yet fetched rows + in fetch_cache */ + mem_heap_t* blob_heap; /*!< in SELECTS BLOB fields are copied + to this heap */ + mem_heap_t* old_vers_heap; /*!< memory heap where a previous + version is built in consistent read */ + bool in_fts_query; /*!< Whether we are in a FTS query */ + /*----------------------*/ + ulonglong autoinc_last_value; + /*!< last value of AUTO-INC interval */ + ulonglong autoinc_increment;/*!< The increment step of the auto + increment column. Value must be + greater than or equal to 1. Required to + calculate the next value */ + ulonglong autoinc_offset; /*!< The offset passed to + get_auto_increment() by MySQL. Required + to calculate the next value */ + dberr_t autoinc_error; /*!< The actual error code encountered + while trying to init or read the + autoinc value from the table. We + store it here so that we can return + it to MySQL */ + /*----------------------*/ + void* idx_cond; /*!< In ICP, pointer to a ha_innobase, + passed to innobase_index_cond(). + NULL if index condition pushdown is + not used. */ + ulint idx_cond_n_cols;/*!< Number of fields in idx_cond_cols. + 0 if and only if idx_cond == NULL. */ + /*----------------------*/ + ulint magic_n2; /*!< this should be the same as + magic_n */ + /*----------------------*/ + unsigned innodb_api:1; /*!< whether this is a InnoDB API + query */ + const rec_t* innodb_api_rec; /*!< InnoDB API search result */ + byte* srch_key_val1; /*!< buffer used in converting + search key values from MySQL format + to InnoDB format.*/ + byte* srch_key_val2; /*!< buffer used in converting + search key values from MySQL format + to InnoDB format.*/ + uint srch_key_val_len; /*!< Size of search key */ + +}; + +/** Callback for row_mysql_sys_index_iterate() */ +struct SysIndexCallback { + virtual ~SysIndexCallback() { } + + /** Callback method + @param mtr - current mini transaction + @param pcur - persistent cursor. */ + virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0; +}; + +#define ROW_PREBUILT_FETCH_MAGIC_N 465765687 + +#define ROW_MYSQL_WHOLE_ROW 0 +#define ROW_MYSQL_REC_FIELDS 1 +#define ROW_MYSQL_NO_TEMPLATE 2 +#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in + row_scan_and_check_index */ + +/* Values for hint_need_to_fetch_extra_cols */ +#define ROW_RETRIEVE_PRIMARY_KEY 1 +#define ROW_RETRIEVE_ALL_COLS 2 + +/* Values for row_read_type */ +#define ROW_READ_WITH_LOCKS 0 +#define ROW_READ_TRY_SEMI_CONSISTENT 1 +#define ROW_READ_DID_SEMI_CONSISTENT 2 + +#ifndef UNIV_NONINL +#include "row0mysql.ic" +#endif + +#endif /* row0mysql.h */ diff --git a/storage/innobase/include/row0mysql.ic b/storage/innobase/include/row0mysql.ic new file mode 100644 index 00000000000..2eb60898c46 --- /dev/null +++ b/storage/innobase/include/row0mysql.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 2001, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0mysql.ic +MySQL interface for Innobase + +Created 1/23/2001 Heikki Tuuri +*******************************************************/ diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h new file mode 100644 index 00000000000..93dcf9cf49b --- /dev/null +++ b/storage/innobase/include/row0purge.h @@ -0,0 +1,128 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0purge.h +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0purge_h +#define row0purge_h + +#include "univ.i" +#include "data0data.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "row0purge.h" +#include "ut0vec.h" + +/********************************************************************//** +Creates a purge node to a query graph. +@return own: purge node */ +UNIV_INTERN +purge_node_t* +row_purge_node_create( +/*==================*/ + que_thr_t* parent, /*!< in: parent node, i.e., a + thr node */ + mem_heap_t* heap) /*!< in: memory heap where created */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************//** +Determines if it is possible to remove a secondary index entry. +Removal is possible if the secondary index entry does not refer to any +not delete marked version of a clustered index record where DB_TRX_ID +is newer than the purge view. + +NOTE: This function should only be called by the purge thread, only +while holding a latch on the leaf page of the secondary index entry +(or keeping the buffer pool watch on the page). It is possible that +this function first returns true and then false, if a user transaction +inserts a record that the secondary index entry would refer to. +However, in that case, the user transaction would also re-insert the +secondary index entry after purge has removed it and released the leaf +page latch. +@return true if the secondary index record can be purged */ +UNIV_INTERN +bool +row_purge_poss_sec( +/*===============*/ + purge_node_t* node, /*!< in/out: row purge node */ + dict_index_t* index, /*!< in: secondary index */ + const dtuple_t* entry) /*!< in: secondary index entry */ + __attribute__((nonnull, warn_unused_result)); +/*************************************************************** +Does the purge operation for a single undo log record. This is a high-level +function used in an SQL execution graph. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_purge_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); + +/* Purge node structure */ + +struct purge_node_t{ + que_common_t common; /*!< node type: QUE_NODE_PURGE */ + /*----------------------*/ + /* Local storage for this graph node */ + roll_ptr_t roll_ptr;/* roll pointer to undo log record */ + ib_vector_t* undo_recs;/*!< Undo recs to purge */ + + undo_no_t undo_no;/* undo number of the record */ + + ulint rec_type;/* undo log record type: TRX_UNDO_INSERT_REC, + ... */ + dict_table_t* table; /*!< table where purge is done */ + + ulint cmpl_info;/* compiler analysis info of an update */ + + upd_t* update; /*!< update vector for a clustered index + record */ + dtuple_t* ref; /*!< NULL, or row reference to the next row to + handle */ + dtuple_t* row; /*!< NULL, or a copy (also fields copied to + heap) of the indexed fields of the row to + handle */ + dict_index_t* index; /*!< NULL, or the next index whose record should + be handled */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage for + row; this must be emptied after a successful + purge of a row */ + ibool found_clust;/* TRUE if the clustered index record + determined by ref was found in the clustered + index, and we were able to position pcur on + it */ + btr_pcur_t pcur; /*!< persistent cursor used in searching the + clustered index record */ + ibool done; /* Debug flag */ + +}; + +#ifndef UNIV_NONINL +#include "row0purge.ic" +#endif + +#endif diff --git a/storage/innobase/include/row0purge.ic b/storage/innobase/include/row0purge.ic new file mode 100644 index 00000000000..700106d1048 --- /dev/null +++ b/storage/innobase/include/row0purge.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + + +/**************************************************//** +@file include/row0purge.ic +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/innobase/include/row0quiesce.h b/storage/innobase/include/row0quiesce.h new file mode 100644 index 00000000000..1d6d11291b8 --- /dev/null +++ b/storage/innobase/include/row0quiesce.h @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0quiesce.h + +Header file for tablespace quiesce functions. + +Created 2012-02-08 by Sunny Bains +*******************************************************/ + +#ifndef row0quiesce_h +#define row0quiesce_h + +#include "univ.i" +#include "dict0types.h" + +struct trx_t; + +/** The version number of the export meta-data text file. */ +#define IB_EXPORT_CFG_VERSION_V1 0x1UL + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. */ +UNIV_INTERN +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + __attribute__((nonnull)); + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or errro code. */ +UNIV_INTERN +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull, warn_unused_result)); + +/*********************************************************************//** +Cleanup after table quiesce. */ +UNIV_INTERN +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + __attribute__((nonnull)); + +#ifndef UNIV_NONINL +#include "row0quiesce.ic" +#endif + +#endif /* row0quiesce_h */ diff --git a/storage/innobase/include/row0quiesce.ic b/storage/innobase/include/row0quiesce.ic new file mode 100644 index 00000000000..f570a6aed05 --- /dev/null +++ b/storage/innobase/include/row0quiesce.ic @@ -0,0 +1,26 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0quiesce.ic + +Quiesce a tablespace. + +Created 2012-02-08 Sunny Bains +*******************************************************/ + diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h new file mode 100644 index 00000000000..a4e5e0dd2fa --- /dev/null +++ b/storage/innobase/include/row0row.h @@ -0,0 +1,343 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0row.h +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0row_h +#define row0row_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "read0types.h" +#include "row0types.h" +#include "btr0types.h" + +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: record offsets */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Reads the trx id field from a clustered index record. +@return value of the field */ +UNIV_INLINE +trx_id_t +row_get_rec_trx_id( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Reads the roll pointer field from a clustered index record. +@return value of the field */ +UNIV_INLINE +roll_ptr_t +row_get_rec_roll_ptr( +/*=================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. */ +UNIV_INTERN +dtuple_t* +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ + __attribute__((warn_unused_result, nonnull(1,3,4))); +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ + __attribute__((warn_unused_result, nonnull(1,3,4))); +/*******************************************************************//** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. +@return own: row built; see the NOTE below! */ +UNIV_INTERN +dtuple_t* +row_build( +/*======*/ + ulint type, /*!< in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the first only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /*!< in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead; the user + columns in this table should be + the same columns as in index->table */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ + row_ext_t** ext, /*!< out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + __attribute__((nonnull(2,3,9))); +/*******************************************************************//** +Converts an index record to a typed data tuple. +@return index entry built; does not set info_bits, and the data fields +in the entry will point directly to rec */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry_low( +/*=======================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint* n_ext, /*!< out: number of externally + stored columns */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. +@return own: index entry built */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in/out: rec_get_offsets(rec) */ + ulint* n_ext, /*!< out: number of externally + stored columns */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. +@return own: row reference built; see the NOTE below! */ +UNIV_INTERN +dtuple_t* +row_build_row_ref( +/*==============*/ + ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /*!< in: secondary index */ + const rec_t* rec, /*!< in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INTERN +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /*!< in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /*!< in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! */ + const dict_index_t* index, /*!< in: secondary index */ + ulint* offsets,/*!< in: rec_get_offsets(rec, index) + or NULL */ + trx_t* trx) /*!< in: transaction or NULL */ + __attribute__((nonnull(1,2,3))); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /*!< in/out: typed data tuple where the + reference is built */ + const ulint* map, /*!< in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /*!< in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ +/***************************************************************//** +Searches the clustered index record for a row, if we have the row +reference. +@return TRUE if found */ +UNIV_INTERN +ibool +row_search_on_row_ref( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor, which must + be closed by the caller */ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const dict_table_t* table, /*!< in: table */ + const dtuple_t* ref, /*!< in: row reference */ + mtr_t* mtr) /*!< in/out: mtr */ + __attribute__((nonnull, warn_unused_result)); +/*********************************************************************//** +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. +@return record or NULL, if no record found */ +UNIV_INTERN +rec_t* +row_get_clust_rec( +/*==============*/ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: secondary index */ + dict_index_t** clust_index,/*!< out: clustered index */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); + +/** Result of row_search_index_entry */ +enum row_search_result { + ROW_FOUND = 0, /*!< the record was found */ + ROW_NOT_FOUND, /*!< record not found */ + ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or + BTR_DELETE_MARK was specified, the + secondary index leaf page was not in + the buffer pool, and the operation was + enqueued in the insert/delete buffer */ + ROW_NOT_DELETED_REF /*!< BTR_DELETE was specified, and + row_purge_poss_sec() failed */ +}; + +/***************************************************************//** +Searches an index record. +@return whether the record was found or buffered */ +UNIV_INTERN +enum row_search_result +row_search_index_entry( +/*===================*/ + dict_index_t* index, /*!< in: index */ + const dtuple_t* entry, /*!< in: index entry */ + ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull, warn_unused_result)); + +#define ROW_COPY_DATA 1 +#define ROW_COPY_POINTERS 2 + +/* The allowed latching order of index records is the following: +(1) a secondary index record -> +(2) the clustered index record -> +(3) rollback segment data for the clustered index record. */ + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size is positive) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +UNIV_INTERN +ulint +row_raw_format( +/*===========*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + const dict_field_t* dict_field, /*!< in: index field */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_NONINL +#include "row0row.ic" +#endif + +#endif diff --git a/storage/innobase/include/row0row.ic b/storage/innobase/include/row0row.ic new file mode 100644 index 00000000000..ac62422be1f --- /dev/null +++ b/storage/innobase/include/row0row.ic @@ -0,0 +1,174 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0row.ic +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0dict.h" +#include "rem0rec.h" +#include "trx0undo.h" + +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: record offsets */ +{ + ulint pos; + ulint offset; + ulint len; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(NULL, index, offsets)); + + pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + + offset = rec_get_nth_field_offs(offsets, pos, &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + return(offset); +} + +/*********************************************************************//** +Reads the trx id field from a clustered index record. +@return value of the field */ +UNIV_INLINE +trx_id_t +row_get_rec_trx_id( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + + return(trx_read_trx_id(rec + offset)); +} + +/*********************************************************************//** +Reads the roll pointer field from a clustered index record. +@return value of the field */ +UNIV_INLINE +roll_ptr_t +row_get_rec_roll_ptr( +/*=================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + + return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); +} + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ +{ + dtuple_t* entry; + + ut_ad(dtuple_check_typed(row)); + entry = row_build_index_entry_low(row, ext, index, heap); + ut_ad(!entry || dtuple_check_typed(entry)); + return(entry); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /*!< in/out: typed data tuple where the + reference is built */ + const ulint* map, /*!< in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /*!< in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint field_no; + ulint i; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dtuple_get_n_fields(ref); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + field_no = *(map + i); + + if (field_no != ULINT_UNDEFINED) { + + field = rec_get_nth_field(rec, offsets, + field_no, &len); + dfield_set_data(dfield, field, len); + } + } +} diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h new file mode 100644 index 00000000000..c8be80f89d9 --- /dev/null +++ b/storage/innobase/include/row0sel.h @@ -0,0 +1,409 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0sel.h +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0sel_h +#define row0sel_h + +#include "univ.i" +#include "data0data.h" +#include "que0types.h" +#include "dict0types.h" +#include "trx0types.h" +#include "row0types.h" +#include "que0types.h" +#include "pars0sym.h" +#include "btr0pcur.h" +#include "read0read.h" +#include "row0mysql.h" + +/*********************************************************************//** +Creates a select node struct. +@return own: select node struct */ +UNIV_INTERN +sel_node_t* +sel_node_create( +/*============*/ + mem_heap_t* heap); /*!< in: memory heap where created */ +/*********************************************************************//** +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ +UNIV_INTERN +void +sel_node_free_private( +/*==================*/ + sel_node_t* node); /*!< in: select node struct */ +/*********************************************************************//** +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ +UNIV_INTERN +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */ +/*********************************************************************//** +Gets the plan node for the nth table in a join. +@return plan node */ +UNIV_INLINE +plan_t* +sel_node_get_nth_plan( +/*==================*/ + sel_node_t* node, /*!< in: select node */ + ulint i); /*!< in: get ith plan node */ +/**********************************************************************//** +Performs a select step. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_sel_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an open or close cursor statement node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +open_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs a fetch for a cursor. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +fetch_step( +/*=======*/ + que_thr_t* thr); /*!< in: query thread */ +/****************************************************************//** +Sample callback function for fetch that prints each row. +@return always returns non-NULL */ +UNIV_INTERN +void* +row_fetch_print( +/*============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg); /*!< in: not used */ +/***********************************************************//** +Prints a row in a select result. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_printf_step( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ +/****************************************************************//** +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. But currently we do not allow search keys where the +last field is only a prefix of the full key field len and print a warning if +such appears. */ +UNIV_INTERN +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /*!< in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /*!< in: buffer to use in field + conversions; NOTE that dtuple->data + may end up pointing inside buf so + do not discard that buffer while + the tuple is being used. See + row_mysql_store_col_in_innobase_format() + in the case of DATA_INT */ + ulint buf_len, /*!< in: buffer length */ + dict_index_t* index, /*!< in: index of the key value */ + const byte* key_ptr, /*!< in: MySQL key value */ + ulint key_len, /*!< in: MySQL key value length */ + trx_t* trx); /*!< in: transaction */ +/********************************************************************//** +Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be tried to the cursor! +@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, +DB_LOCK_TABLE_FULL, or DB_TOO_BIG_RECORD */ +UNIV_INTERN +dberr_t +row_search_for_mysql( +/*=================*/ + byte* buf, /*!< in/out: buffer for the fetched + row in the MySQL format */ + ulint mode, /*!< in: search mode PAGE_CUR_L, ... */ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or + ROW_SEL_EXACT_PREFIX */ + ulint direction) /*!< in: 0 or ROW_SEL_NEXT or + ROW_SEL_PREV; NOTE: if this is != 0, + then prebuilt must have a pcur + with stored position! In opening of a + cursor 'direction' should be 0. */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Checks if MySQL at the moment is allowed for this table to retrieve a +consistent read result, or store it to the query cache. +@return TRUE if storing or retrieving from the query cache is permitted */ +UNIV_INTERN +ibool +row_search_check_if_query_cache_permitted( +/*======================================*/ + trx_t* trx, /*!< in: transaction object */ + const char* norm_name); /*!< in: concatenation of database name, + '/' char, table name */ +/*******************************************************************//** +Read the max AUTOINC value from an index. +@return DB_SUCCESS if all OK else error code */ +UNIV_INTERN +dberr_t +row_search_max_autoinc( +/*===================*/ + dict_index_t* index, /*!< in: index to search */ + const char* col_name, /*!< in: autoinc column name */ + ib_uint64_t* value) /*!< out: AUTOINC value read */ + __attribute__((nonnull, warn_unused_result)); + +/** A structure for caching column values for prefetched rows */ +struct sel_buf_t{ + byte* data; /*!< data, or NULL; if not NULL, this field + has allocated memory which must be explicitly + freed; can be != NULL even when len is + UNIV_SQL_NULL */ + ulint len; /*!< data length or UNIV_SQL_NULL */ + ulint val_buf_size; + /*!< size of memory buffer allocated for data: + this can be more than len; this is defined + when data != NULL */ +}; + +/** Query plan */ +struct plan_t{ + dict_table_t* table; /*!< table struct in the dictionary + cache */ + dict_index_t* index; /*!< table index used in the search */ + btr_pcur_t pcur; /*!< persistent cursor used to search + the index */ + ibool asc; /*!< TRUE if cursor traveling upwards */ + ibool pcur_is_open; /*!< TRUE if pcur has been positioned + and we can try to fetch new rows */ + ibool cursor_at_end; /*!< TRUE if the cursor is open but + we know that there are no more + qualifying rows left to retrieve from + the index tree; NOTE though, that + there may still be unprocessed rows in + the prefetch stack; always FALSE when + pcur_is_open is FALSE */ + ibool stored_cursor_rec_processed; + /*!< TRUE if the pcur position has been + stored and the record it is positioned + on has already been processed */ + que_node_t** tuple_exps; /*!< array of expressions + which are used to calculate + the field values in the search + tuple: there is one expression + for each field in the search + tuple */ + dtuple_t* tuple; /*!< search tuple */ + ulint mode; /*!< search mode: PAGE_CUR_G, ... */ + ulint n_exact_match; /*!< number of first fields in + the search tuple which must be + exactly matched */ + ibool unique_search; /*!< TRUE if we are searching an + index record with a unique key */ + ulint n_rows_fetched; /*!< number of rows fetched using pcur + after it was opened */ + ulint n_rows_prefetched;/*!< number of prefetched rows cached + for fetch: fetching several rows in + the same mtr saves CPU time */ + ulint first_prefetched;/*!< index of the first cached row in + select buffer arrays for each column */ + ibool no_prefetch; /*!< no prefetch for this table */ + sym_node_list_t columns; /*!< symbol table nodes for the columns + to retrieve from the table */ + UT_LIST_BASE_NODE_T(func_node_t) + end_conds; /*!< conditions which determine the + fetch limit of the index segment we + have to look at: when one of these + fails, the result set has been + exhausted for the cursor in this + index; these conditions are normalized + so that in a comparison the column + for this table is the first argument */ + UT_LIST_BASE_NODE_T(func_node_t) + other_conds; /*!< the rest of search conditions we can + test at this table in a join */ + ibool must_get_clust; /*!< TRUE if index is a non-clustered + index and we must also fetch the + clustered index record; this is the + case if the non-clustered record does + not contain all the needed columns, or + if this is a single-table explicit + cursor, or a searched update or + delete */ + ulint* clust_map; /*!< map telling how clust_ref is built + from the fields of a non-clustered + record */ + dtuple_t* clust_ref; /*!< the reference to the clustered + index entry is built here if index is + a non-clustered index */ + btr_pcur_t clust_pcur; /*!< if index is non-clustered, we use + this pcur to search the clustered + index */ + mem_heap_t* old_vers_heap; /*!< memory heap used in building an old + version of a row, or NULL */ +}; + +/** Select node states */ +enum sel_node_state { + SEL_NODE_CLOSED, /*!< it is a declared cursor which is not + currently open */ + SEL_NODE_OPEN, /*!< intention locks not yet set on tables */ + SEL_NODE_FETCH, /*!< intention locks have been set */ + SEL_NODE_NO_MORE_ROWS /*!< cursor has reached the result set end */ +}; + +/** Select statement node */ +struct sel_node_t{ + que_common_t common; /*!< node type: QUE_NODE_SELECT */ + enum sel_node_state + state; /*!< node state */ + que_node_t* select_list; /*!< select list */ + sym_node_t* into_list; /*!< variables list or NULL */ + sym_node_t* table_list; /*!< table list */ + ibool asc; /*!< TRUE if the rows should be fetched + in an ascending order */ + ibool set_x_locks; /*!< TRUE if the cursor is for update or + delete, which means that a row x-lock + should be placed on the cursor row */ + ulint row_lock_mode; /*!< LOCK_X or LOCK_S */ + ulint n_tables; /*!< number of tables */ + ulint fetch_table; /*!< number of the next table to access + in the join */ + plan_t* plans; /*!< array of n_tables many plan nodes + containing the search plan and the + search data structures */ + que_node_t* search_cond; /*!< search condition */ + read_view_t* read_view; /*!< if the query is a non-locking + consistent read, its read view is + placed here, otherwise NULL */ + ibool consistent_read;/*!< TRUE if the select is a consistent, + non-locking read */ + order_node_t* order_by; /*!< order by column definition, or + NULL */ + ibool is_aggregate; /*!< TRUE if the select list consists of + aggregate functions */ + ibool aggregate_already_fetched; + /*!< TRUE if the aggregate row has + already been fetched for the current + cursor */ + ibool can_get_updated;/*!< this is TRUE if the select + is in a single-table explicit + cursor which can get updated + within the stored procedure, + or in a searched update or + delete; NOTE that to determine + of an explicit cursor if it + can get updated, the parser + checks from a stored procedure + if it contains positioned + update or delete statements */ + sym_node_t* explicit_cursor;/*!< not NULL if an explicit cursor */ + UT_LIST_BASE_NODE_T(sym_node_t) + copy_variables; /*!< variables whose values we have to + copy when an explicit cursor is opened, + so that they do not change between + fetches */ +}; + +/** Fetch statement node */ +struct fetch_node_t{ + que_common_t common; /*!< type: QUE_NODE_FETCH */ + sel_node_t* cursor_def; /*!< cursor definition */ + sym_node_t* into_list; /*!< variables to set */ + + pars_user_func_t* + func; /*!< User callback function or NULL. + The first argument to the function + is a sel_node_t*, containing the + results of the SELECT operation for + one row. If the function returns + NULL, it is not interested in + further rows and the cursor is + modified so (cursor % NOTFOUND) is + true. If it returns not-NULL, + continue normally. See + row_fetch_print() for an example + (and a useful debugging tool). */ +}; + +/** Open or close cursor operation type */ +enum open_node_op { + ROW_SEL_OPEN_CURSOR, /*!< open cursor */ + ROW_SEL_CLOSE_CURSOR /*!< close cursor */ +}; + +/** Open or close cursor statement node */ +struct open_node_t{ + que_common_t common; /*!< type: QUE_NODE_OPEN */ + enum open_node_op + op_type; /*!< operation type: open or + close cursor */ + sel_node_t* cursor_def; /*!< cursor definition */ +}; + +/** Row printf statement node */ +struct row_printf_node_t{ + que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */ + sel_node_t* sel_node; /*!< select */ +}; + +/** Search direction for the MySQL interface */ +enum row_sel_direction { + ROW_SEL_NEXT = 1, /*!< ascending direction */ + ROW_SEL_PREV = 2 /*!< descending direction */ +}; + +/** Match mode for the MySQL interface */ +enum row_sel_match_mode { + ROW_SEL_EXACT = 1, /*!< search using a complete key value */ + ROW_SEL_EXACT_PREFIX /*!< search using a key prefix which + must match rows: the prefix may + contain an incomplete field (the last + field in prefix may be just a prefix + of a fixed length column) */ +}; + +#ifndef UNIV_NONINL +#include "row0sel.ic" +#endif + +#endif diff --git a/storage/innobase/include/row0sel.ic b/storage/innobase/include/row0sel.ic new file mode 100644 index 00000000000..d83a3448832 --- /dev/null +++ b/storage/innobase/include/row0sel.ic @@ -0,0 +1,105 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0sel.ic +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" + +/*********************************************************************//** +Gets the plan node for the nth table in a join. +@return plan node */ +UNIV_INLINE +plan_t* +sel_node_get_nth_plan( +/*==================*/ + sel_node_t* node, /*!< in: select node */ + ulint i) /*!< in: get ith plan node */ +{ + ut_ad(i < node->n_tables); + + return(node->plans + i); +} + +/*********************************************************************//** +Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means +that it will start fetching from the start of the result set again, regardless +of where it was before, and it will set intention locks on the tables. */ +UNIV_INLINE +void +sel_node_reset_cursor( +/*==================*/ + sel_node_t* node) /*!< in: select node */ +{ + node->state = SEL_NODE_OPEN; +} + +/**********************************************************************//** +Performs an execution step of an open or close cursor statement node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +open_step( +/*======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + sel_node_t* sel_node; + open_node_t* node; + ulint err; + + ut_ad(thr); + + node = (open_node_t*) thr->run_node; + ut_ad(que_node_get_type(node) == QUE_NODE_OPEN); + + sel_node = node->cursor_def; + + err = DB_SUCCESS; + + if (node->op_type == ROW_SEL_OPEN_CURSOR) { + + /* if (sel_node->state == SEL_NODE_CLOSED) { */ + + sel_node_reset_cursor(sel_node); + /* } else { + err = DB_ERROR; + } */ + } else { + if (sel_node->state != SEL_NODE_CLOSED) { + + sel_node->state = SEL_NODE_CLOSED; + } else { + err = DB_ERROR; + } + } + + if (err != DB_SUCCESS) { + /* SQL error detected */ + fprintf(stderr, "SQL error %lu\n", (ulong) err); + + ut_error; + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h new file mode 100644 index 00000000000..52c89cb01fa --- /dev/null +++ b/storage/innobase/include/row0types.h @@ -0,0 +1,55 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0types.h +Row operation global types + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0types_h +#define row0types_h + +struct plan_t; + +struct upd_t; +struct upd_field_t; +struct upd_node_t; +struct del_node_t; +struct ins_node_t; +struct sel_node_t; +struct open_node_t; +struct fetch_node_t; + +struct row_printf_node_t; +struct sel_buf_t; + +struct undo_node_t; + +struct purge_node_t; + +struct row_ext_t; + +/** Buffer for logging modifications during online index creation */ +struct row_log_t; + +/* MySQL data types */ +struct TABLE; + +#endif diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h new file mode 100644 index 00000000000..ebf4881208a --- /dev/null +++ b/storage/innobase/include/row0uins.h @@ -0,0 +1,54 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0uins.h +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0uins_h +#define row0uins_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/***********************************************************//** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. +@return DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_undo_ins( +/*=========*/ + undo_node_t* node) /*!< in: row undo node */ + __attribute__((nonnull, warn_unused_result)); +#ifndef UNIV_NONINL +#include "row0uins.ic" +#endif + +#endif diff --git a/storage/innobase/include/row0uins.ic b/storage/innobase/include/row0uins.ic new file mode 100644 index 00000000000..54da2e49874 --- /dev/null +++ b/storage/innobase/include/row0uins.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0uins.ic +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h new file mode 100644 index 00000000000..f89d5a334fc --- /dev/null +++ b/storage/innobase/include/row0umod.h @@ -0,0 +1,52 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0umod.h +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0umod_h +#define row0umod_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/***********************************************************//** +Undoes a modify operation on a row of a table. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_undo_mod( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_NONINL +#include "row0umod.ic" +#endif + +#endif diff --git a/storage/innobase/include/row0umod.ic b/storage/innobase/include/row0umod.ic new file mode 100644 index 00000000000..00a8cd86e01 --- /dev/null +++ b/storage/innobase/include/row0umod.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0umod.ic +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h new file mode 100644 index 00000000000..5dddfb4eae1 --- /dev/null +++ b/storage/innobase/include/row0undo.h @@ -0,0 +1,135 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0undo.h +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0undo_h +#define row0undo_h + +#include "univ.i" +#include "mtr0mtr.h" +#include "trx0sys.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" + +/********************************************************************//** +Creates a row undo node to a query graph. +@return own: undo node */ +UNIV_INTERN +undo_node_t* +row_undo_node_create( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ + mem_heap_t* heap); /*!< in: memory heap where created */ +/***********************************************************//** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. +@return TRUE if found; NOTE the node->pcur must be closed by the +caller, regardless of the return value */ +UNIV_INTERN +ibool +row_undo_search_clust_to_pcur( +/*==========================*/ + undo_node_t* node); /*!< in: row undo node */ +/***********************************************************//** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_undo_step( +/*==========*/ + que_thr_t* thr); /*!< in: query thread */ + +/* A single query thread will try to perform the undo for all successive +versions of a clustered index record, if the transaction has modified it +several times during the execution which is rolled back. It may happen +that the task is transferred to another query thread, if the other thread +is assigned to handle an undo log record in the chain of different versions +of the record, and the other thread happens to get the x-latch to the +clustered index record at the right time. + If a query thread notices that the clustered index record it is looking +for is missing, or the roll ptr field in the record doed not point to the +undo log record the thread was assigned to handle, then it gives up the undo +task for that undo log record, and fetches the next. This situation can occur +just in the case where the transaction modified the same record several times +and another thread is currently doing the undo for successive versions of +that index record. */ + +/** Execution state of an undo node */ +enum undo_exec { + UNDO_NODE_FETCH_NEXT = 1, /*!< we should fetch the next + undo log record */ + UNDO_NODE_INSERT, /*!< undo a fresh insert of a + row to a table */ + UNDO_NODE_MODIFY /*!< undo a modify operation + (DELETE or UPDATE) on a row + of a table */ +}; + +/** Undo node structure */ +struct undo_node_t{ + que_common_t common; /*!< node type: QUE_NODE_UNDO */ + enum undo_exec state; /*!< node execution state */ + trx_t* trx; /*!< trx for which undo is done */ + roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */ + trx_undo_rec_t* undo_rec;/*!< undo log record */ + undo_no_t undo_no;/*!< undo number of the record */ + ulint rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC, + ... */ + trx_id_t new_trx_id; /*!< trx id to restore to clustered index + record */ + btr_pcur_t pcur; /*!< persistent cursor used in searching the + clustered index record */ + dict_table_t* table; /*!< table where undo is done */ + ulint cmpl_info;/*!< compiler analysis of an update */ + upd_t* update; /*!< update vector for a clustered index + record */ + dtuple_t* ref; /*!< row reference to the next row to handle */ + dtuple_t* row; /*!< a copy (also fields copied to heap) of the + row to handle */ + row_ext_t* ext; /*!< NULL, or prefixes of the externally + stored columns of the row */ + dtuple_t* undo_row;/*!< NULL, or the row after undo */ + row_ext_t* undo_ext;/*!< NULL, or prefixes of the externally + stored columns of undo_row */ + dict_index_t* index; /*!< the next index whose record should be + handled */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage for + row; this must be emptied after undo is tried + on a row */ +}; + + +#ifndef UNIV_NONINL +#include "row0undo.ic" +#endif + +#endif diff --git a/storage/innobase/include/row0undo.ic b/storage/innobase/include/row0undo.ic new file mode 100644 index 00000000000..b97ffca590e --- /dev/null +++ b/storage/innobase/include/row0undo.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0undo.ic +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h new file mode 100644 index 00000000000..27dedeb65a7 --- /dev/null +++ b/storage/innobase/include/row0upd.h @@ -0,0 +1,540 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0upd.h +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0upd_h +#define row0upd_h + +#include "univ.i" +#include "data0data.h" +#include "row0types.h" +#include "btr0types.h" +#include "dict0types.h" +#include "trx0types.h" + +#ifndef UNIV_HOTBACKUP +# include "btr0pcur.h" +# include "que0types.h" +# include "pars0types.h" +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Creates an update vector object. +@return own: update vector object */ +UNIV_INLINE +upd_t* +upd_create( +/*=======*/ + ulint n, /*!< in: number of fields */ + mem_heap_t* heap); /*!< in: heap from which memory allocated */ +/*********************************************************************//** +Returns the number of fields in the update vector == number of columns +to be updated by an update vector. +@return number of fields */ +UNIV_INLINE +ulint +upd_get_n_fields( +/*=============*/ + const upd_t* update); /*!< in: update vector */ +#ifdef UNIV_DEBUG +/*********************************************************************//** +Returns the nth field of an update vector. +@return update vector field */ +UNIV_INLINE +upd_field_t* +upd_get_nth_field( +/*==============*/ + const upd_t* update, /*!< in: update vector */ + ulint n); /*!< in: field position in update vector */ +#else +# define upd_get_nth_field(update, n) ((update)->fields + (n)) +#endif +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Sets an index field number to be updated by an update vector field. */ +UNIV_INLINE +void +upd_field_set_field_no( +/*===================*/ + upd_field_t* upd_field, /*!< in: update vector field */ + ulint field_no, /*!< in: field number in a clustered + index */ + dict_index_t* index, /*!< in: index */ + trx_t* trx); /*!< in: transaction */ +/*********************************************************************//** +Returns a field of an update vector by field_no. +@return update vector field, or NULL */ +UNIV_INLINE +const upd_field_t* +upd_get_field_by_field_no( +/*======================*/ + const upd_t* update, /*!< in: update vector */ + ulint no) /*!< in: field_no */ + __attribute__((nonnull, pure)); +/*********************************************************************//** +Writes into the redo log the values of trx id and roll ptr and enough info +to determine their positions within a clustered index record. +@return new pointer to mlog */ +UNIV_INTERN +byte* +row_upd_write_sys_vals_to_log( +/*==========================*/ + dict_index_t* index, /*!< in: clustered index */ + trx_id_t trx_id, /*!< in: transaction id */ + roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */ + byte* log_ptr,/*!< pointer to a buffer of size > 20 opened + in mlog */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************************//** +Updates the trx id and roll ptr field in a clustered index record when +a row is updated or marked deleted. */ +UNIV_INLINE +void +row_upd_rec_sys_fields( +/*===================*/ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const trx_t* trx, /*!< in: transaction */ + roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record, + can be 0 during IMPORT */ +/*********************************************************************//** +Sets the trx id or roll ptr field of a clustered index entry. */ +UNIV_INTERN +void +row_upd_index_entry_sys_field( +/*==========================*/ + dtuple_t* entry, /*!< in/out: index entry, where the memory + buffers for sys fields are already allocated: + the function just copies the new values to + them */ + dict_index_t* index, /*!< in: clustered index */ + ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */ + ib_uint64_t val); /*!< in: value to write */ +/*********************************************************************//** +Creates an update node for a query graph. +@return own: update node */ +UNIV_INTERN +upd_node_t* +upd_node_create( +/*============*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Writes to the redo log the new values of the fields occurring in the index. */ +UNIV_INTERN +void +row_upd_index_write_log( +/*====================*/ + const upd_t* update, /*!< in: update vector */ + byte* log_ptr,/*!< in: pointer to mlog buffer: must + contain at least MLOG_BUF_MARGIN bytes + of free space; the buffer is closed + within this function */ + mtr_t* mtr); /*!< in: mtr into whose log to write */ +/***********************************************************//** +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. +@return TRUE if the update changes the size of some field in index or +the field is external in rec or update */ +UNIV_INTERN +ibool +row_upd_changes_field_size_or_external( +/*===================================*/ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update);/*!< in: update vector */ +/***********************************************************//** +Returns true if row update contains disowned external fields. +@return true if the update contains disowned external fields. */ +UNIV_INTERN +bool +row_upd_changes_disowned_external( +/*==============================*/ + const upd_t* update) /*!< in: update vector */ + __attribute__((nonnull, warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Replaces the new column values stored in the update vector to the +record given. No field size changes are allowed. This function is +usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). */ +UNIV_INTERN +void +row_upd_rec_in_place( +/*=================*/ + rec_t* rec, /*!< in/out: record where replaced */ + dict_index_t* index, /*!< in: the index the record belongs to */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + page_zip_des_t* page_zip);/*!< in: compressed page with enough space + available, or NULL */ +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Builds an update vector from those fields which in a secondary index entry +differ from a record that has the equal ordering fields. NOTE: we compare +the fields as binary strings! +@return own: update vector of differing fields */ +UNIV_INTERN +upd_t* +row_upd_build_sec_rec_difference_binary( +/*====================================*/ + const rec_t* rec, /*!< in: secondary index record */ + dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const dtuple_t* entry, /*!< in: entry to insert */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ + __attribute__((warn_unused_result, nonnull)); +/***************************************************************//** +Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. NOTE: we compare the fields as binary strings! +@return own: update vector of differing fields, excluding roll ptr and +trx id */ +UNIV_INTERN +const upd_t* +row_upd_build_difference_binary( +/*============================*/ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* entry, /*!< in: entry to insert */ + const rec_t* rec, /*!< in: clustered index record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index), or NULL */ + bool no_sys, /*!< in: skip the system columns + DB_TRX_ID and DB_ROLL_PTR */ + trx_t* trx, /*!< in: transaction (for diagnostics), + or NULL */ + mem_heap_t* heap) /*!< in: memory heap from which allocated */ + __attribute__((nonnull(1,2,3,7), warn_unused_result)); +/***********************************************************//** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals_index_pos( +/*=========================================*/ + dtuple_t* entry, /*!< in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /*!< in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /*!< in: an update vector built for the index so + that the field number in an upd_field is the + index position */ + ibool order_only, + /*!< in: if TRUE, limit the replacement to + ordering fields of index; note that this + does not work for non-clustered indexes. */ + mem_heap_t* heap) /*!< in: memory heap for allocating and + copying the new values */ + __attribute__((nonnull)); +/***********************************************************//** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals( +/*===============================*/ + dtuple_t* entry, /*!< in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /*!< in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /*!< in: an update vector built for the + CLUSTERED index so that the field number in + an upd_field is the clustered index position */ + mem_heap_t* heap) /*!< in: memory heap for allocating and + copying the new values */ + __attribute__((nonnull)); +/***********************************************************//** +Replaces the new column values stored in the update vector. */ +UNIV_INTERN +void +row_upd_replace( +/*============*/ + dtuple_t* row, /*!< in/out: row where replaced, + indexed by col_no; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + row_ext_t** ext, /*!< out, own: NULL, or externally + stored column prefixes */ + const dict_index_t* index, /*!< in: clustered index */ + const upd_t* update, /*!< in: an update vector built for the + clustered index */ + mem_heap_t* heap); /*!< in: memory heap */ +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. + +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector changes an ordering field in the index record */ +UNIV_INTERN +ibool +row_upd_changes_ord_field_binary_func( +/*==================================*/ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! */ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + const dtuple_t* row, /*!< in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + const row_ext_t*ext) /*!< NULL, or prefixes of the externally + stored columns in the old row */ + __attribute__((nonnull(1,2), warn_unused_result)); +#ifdef UNIV_DEBUG +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,thr,row,ext) +#else /* UNIV_DEBUG */ +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,row,ext) +#endif /* UNIV_DEBUG */ +/***********************************************************//** +Checks if an FTS indexed column is affected by an UPDATE. +@return offset within fts_t::indexes if FTS indexed column updated else +ULINT_UNDEFINED */ +UNIV_INTERN +ulint +row_upd_changes_fts_column( +/*=======================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field); /*!< in: field to check */ +/***********************************************************//** +Checks if an FTS Doc ID column is affected by an UPDATE. +@return whether Doc ID column is affected */ +UNIV_INTERN +bool +row_upd_changes_doc_id( +/*===================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector may change an ordering field in an index +record */ +UNIV_INTERN +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + const dict_table_t* table, /*!< in: table */ + const upd_t* update);/*!< in: update vector for the row */ +/***********************************************************//** +Updates a row in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +UNIV_INTERN +que_thr_t* +row_upd_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ +#endif /* !UNIV_HOTBACKUP */ +/*********************************************************************//** +Parses the log data of system field values. +@return log data end or NULL */ +UNIV_INTERN +byte* +row_upd_parse_sys_vals( +/*===================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + ulint* pos, /*!< out: TRX_ID position in record */ + trx_id_t* trx_id, /*!< out: trx id */ + roll_ptr_t* roll_ptr);/*!< out: roll ptr */ +/*********************************************************************//** +Updates the trx id and roll ptr field in a clustered index record in database +recovery. */ +UNIV_INTERN +void +row_upd_rec_sys_fields_in_recovery( +/*===============================*/ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint pos, /*!< in: TRX_ID position in rec */ + trx_id_t trx_id, /*!< in: transaction id */ + roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record */ +/*********************************************************************//** +Parses the log data written by row_upd_index_write_log. +@return log data end or NULL */ +UNIV_INTERN +byte* +row_upd_index_parse( +/*================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + mem_heap_t* heap, /*!< in: memory heap where update vector is + built */ + upd_t** update_out);/*!< out: update vector */ + + +/* Update vector field */ +struct upd_field_t{ + unsigned field_no:16; /*!< field number in an index, usually + the clustered index, but in updating + a secondary index record in btr0cur.cc + this is the position in the secondary + index */ +#ifndef UNIV_HOTBACKUP + unsigned orig_len:16; /*!< original length of the locally + stored part of an externally stored + column, or 0 */ + que_node_t* exp; /*!< expression for calculating a new + value: it refers to column values and + constants in the symbol table of the + query graph */ +#endif /* !UNIV_HOTBACKUP */ + dfield_t new_val; /*!< new value for the column */ +}; + +/* Update vector structure */ +struct upd_t{ + ulint info_bits; /*!< new value of info bits to record; + default is 0 */ + ulint n_fields; /*!< number of update fields */ + upd_field_t* fields; /*!< array of update fields */ +}; + +#ifndef UNIV_HOTBACKUP +/* Update node structure which also implements the delete operation +of a row */ + +struct upd_node_t{ + que_common_t common; /*!< node type: QUE_NODE_UPDATE */ + ibool is_delete;/* TRUE if delete, FALSE if update */ + ibool searched_update; + /* TRUE if searched update, FALSE if + positioned */ + ibool in_mysql_interface; + /* TRUE if the update node was created + for the MySQL interface */ + dict_foreign_t* foreign;/* NULL or pointer to a foreign key + constraint if this update node is used in + doing an ON DELETE or ON UPDATE operation */ + upd_node_t* cascade_node;/* NULL or an update node template which + is used to implement ON DELETE/UPDATE CASCADE + or ... SET NULL for foreign keys */ + mem_heap_t* cascade_heap;/* NULL or a mem heap where the cascade + node is created */ + sel_node_t* select; /*!< query graph subtree implementing a base + table cursor: the rows returned will be + updated */ + btr_pcur_t* pcur; /*!< persistent cursor placed on the clustered + index record which should be updated or + deleted; the cursor is stored in the graph + of 'select' field above, except in the case + of the MySQL interface */ + dict_table_t* table; /*!< table where updated */ + upd_t* update; /*!< update vector for the row */ + ulint update_n_fields; + /* when this struct is used to implement + a cascade operation for foreign keys, we store + here the size of the buffer allocated for use + as the update vector */ + sym_node_list_t columns;/* symbol table nodes for the columns + to retrieve from the table */ + ibool has_clust_rec_x_lock; + /* TRUE if the select which retrieves the + records to update already sets an x-lock on + the clustered record; note that it must always + set at least an s-lock */ + ulint cmpl_info;/* information extracted during query + compilation; speeds up execution: + UPD_NODE_NO_ORD_CHANGE and + UPD_NODE_NO_SIZE_CHANGE, ORed */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + dict_index_t* index; /*!< NULL, or the next index whose record should + be updated */ + dtuple_t* row; /*!< NULL, or a copy (also fields copied to + heap) of the row to update; this must be reset + to NULL after a successful update */ + row_ext_t* ext; /*!< NULL, or prefixes of the externally + stored columns in the old row */ + dtuple_t* upd_row;/* NULL, or a copy of the updated row */ + row_ext_t* upd_ext;/* NULL, or prefixes of the externally + stored columns in upd_row */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage; + this must be emptied after a successful + update */ + /*----------------------*/ + sym_node_t* table_sym;/* table node in symbol table */ + que_node_t* col_assign_list; + /* column assignment list */ + ulint magic_n; +}; + +#define UPD_NODE_MAGIC_N 1579975 + +/* Node execution states */ +#define UPD_NODE_SET_IX_LOCK 1 /* execution came to the node from + a node above and if the field + has_clust_rec_x_lock is FALSE, we + should set an intention x-lock on + the table */ +#define UPD_NODE_UPDATE_CLUSTERED 2 /* clustered index record should be + updated */ +#define UPD_NODE_INSERT_CLUSTERED 3 /* clustered index record should be + inserted, old record is already delete + marked */ +#define UPD_NODE_INSERT_BLOB 4 /* clustered index record should be + inserted, old record is already + delete-marked; non-updated BLOBs + should be inherited by the new record + and disowned by the old record */ +#define UPD_NODE_UPDATE_ALL_SEC 5 /* an ordering field of the clustered + index record was changed, or this is + a delete operation: should update + all the secondary index records */ +#define UPD_NODE_UPDATE_SOME_SEC 6 /* secondary index entries should be + looked at and updated if an ordering + field changed */ + +/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */ +#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be + changed in the update and no ordering + field of the clustered index */ +#define UPD_NODE_NO_SIZE_CHANGE 2 /* no record field size will be + changed in the update */ + +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_NONINL +#include "row0upd.ic" +#endif + +#endif diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic new file mode 100644 index 00000000000..618a77fa4bf --- /dev/null +++ b/storage/innobase/include/row0upd.ic @@ -0,0 +1,188 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0upd.ic +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#include "mtr0log.h" +#ifndef UNIV_HOTBACKUP +# include "trx0trx.h" +# include "trx0undo.h" +# include "row0row.h" +# include "lock0lock.h" +#endif /* !UNIV_HOTBACKUP */ +#include "page0zip.h" + +/*********************************************************************//** +Creates an update vector object. +@return own: update vector object */ +UNIV_INLINE +upd_t* +upd_create( +/*=======*/ + ulint n, /*!< in: number of fields */ + mem_heap_t* heap) /*!< in: heap from which memory allocated */ +{ + upd_t* update; + + update = (upd_t*) mem_heap_zalloc(heap, sizeof(upd_t)); + + update->n_fields = n; + update->fields = (upd_field_t*) + mem_heap_zalloc(heap, sizeof(upd_field_t) * n); + + return(update); +} + +/*********************************************************************//** +Returns the number of fields in the update vector == number of columns +to be updated by an update vector. +@return number of fields */ +UNIV_INLINE +ulint +upd_get_n_fields( +/*=============*/ + const upd_t* update) /*!< in: update vector */ +{ + ut_ad(update); + + return(update->n_fields); +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Returns the nth field of an update vector. +@return update vector field */ +UNIV_INLINE +upd_field_t* +upd_get_nth_field( +/*==============*/ + const upd_t* update, /*!< in: update vector */ + ulint n) /*!< in: field position in update vector */ +{ + ut_ad(update); + ut_ad(n < update->n_fields); + + return((upd_field_t*) update->fields + n); +} +#endif /* UNIV_DEBUG */ + +#ifndef UNIV_HOTBACKUP +/*********************************************************************//** +Sets an index field number to be updated by an update vector field. */ +UNIV_INLINE +void +upd_field_set_field_no( +/*===================*/ + upd_field_t* upd_field, /*!< in: update vector field */ + ulint field_no, /*!< in: field number in a clustered + index */ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction */ +{ + upd_field->field_no = field_no; + upd_field->orig_len = 0; + + if (field_no >= dict_index_get_n_fields(index)) { + fprintf(stderr, + "InnoDB: Error: trying to access field %lu in ", + (ulong) field_no); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, "\n" + "InnoDB: but index only has %lu fields\n", + (ulong) dict_index_get_n_fields(index)); + ut_ad(0); + } + + dict_col_copy_type(dict_index_get_nth_col(index, field_no), + dfield_get_type(&upd_field->new_val)); +} + +/*********************************************************************//** +Returns a field of an update vector by field_no. +@return update vector field, or NULL */ +UNIV_INLINE +const upd_field_t* +upd_get_field_by_field_no( +/*======================*/ + const upd_t* update, /*!< in: update vector */ + ulint no) /*!< in: field_no */ +{ + ulint i; + for (i = 0; i < upd_get_n_fields(update); i++) { + const upd_field_t* uf = upd_get_nth_field(update, i); + + if (uf->field_no == no) { + + return(uf); + } + } + + return(NULL); +} + +/*********************************************************************//** +Updates the trx id and roll ptr field in a clustered index record when +a row is updated or marked deleted. */ +UNIV_INLINE +void +row_upd_rec_sys_fields( +/*===================*/ + rec_t* rec, /*!< in/out: record */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + const trx_t* trx, /*!< in: transaction */ + roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record, + can be 0 during IMPORT */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (page_zip) { + ulint pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets, + pos, trx->id, roll_ptr); + } else { + ulint offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" +#endif + /* During IMPORT the trx id in the record can be in the + future, if the .ibd file is being imported from another + instance. During IMPORT roll_ptr will be 0. */ + ut_ad(roll_ptr == 0 + || lock_check_trx_id_sanity( + trx_read_trx_id(rec + offset), + rec, index, offsets)); + + trx_write_trx_id(rec + offset, trx->id); + trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr); + } +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h new file mode 100644 index 00000000000..1df5b4d3e98 --- /dev/null +++ b/storage/innobase/include/row0vers.h @@ -0,0 +1,146 @@ +/***************************************************************************** + +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0vers.h +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0vers_h +#define row0vers_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "rem0types.h" +#include "mtr0mtr.h" +#include "read0types.h" + +/*****************************************************************//** +Finds out if an active transaction has inserted or modified a secondary +index record. +@return 0 if committed, else the active transaction id; +NOTE that this function can return false positives but never false +negatives. The caller must confirm all positive results by calling +trx_is_active() while holding lock_sys->mutex. */ +UNIV_INTERN +trx_id_t +row_vers_impl_x_locked( +/*===================*/ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: the secondary index */ + const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */ +/*****************************************************************//** +Finds out if we must preserve a delete marked earlier version of a clustered +index record, because it is >= the purge view. +@return TRUE if earlier version should be preserved */ +UNIV_INTERN +ibool +row_vers_must_preserve_del_marked( +/*==============================*/ + trx_id_t trx_id, /*!< in: transaction id in the version */ + mtr_t* mtr); /*!< in: mtr holding the latch on the + clustered index record; it will also + hold the latch on purge_view */ +/*****************************************************************//** +Finds out if a version of the record, where the version >= the current +purge view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry == ientry; exactly in +this case we return TRUE. +@return TRUE if earlier version should have */ +UNIV_INTERN +ibool +row_vers_old_has_index_entry( +/*=========================*/ + ibool also_curr,/*!< in: TRUE if also rec is included in the + versions to search; otherwise only versions + prior to it are searched */ + const rec_t* rec, /*!< in: record in the clustered index; the + caller must have a latch on the page */ + mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will + also hold the latch on purge_view */ + dict_index_t* index, /*!< in: the secondary index */ + const dtuple_t* ientry);/*!< in: the secondary index entry */ +/*****************************************************************//** +Constructs the version of a clustered index record which a consistent +read should see. We assume that the trx id stored in rec is such that +the consistent read should not see rec in its present version. +@return DB_SUCCESS or DB_MISSING_HISTORY */ +UNIV_INTERN +dberr_t +row_vers_build_for_consistent_read( +/*===============================*/ + const rec_t* rec, /*!< in: record in a clustered index; the + caller must have a latch on the page; this + latch locks the top of the stack of versions + of this records */ + mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will + also hold the latch on purge_view */ + dict_index_t* index, /*!< in: the clustered index */ + ulint** offsets,/*!< in/out: offsets returned by + rec_get_offsets(rec, index) */ + read_view_t* view, /*!< in: the consistent read view */ + mem_heap_t** offset_heap,/*!< in/out: memory heap from which + the offsets are allocated */ + mem_heap_t* in_heap,/*!< in: memory heap from which the memory for + *old_vers is allocated; memory for possible + intermediate versions is allocated and freed + locally within the function */ + rec_t** old_vers)/*!< out, own: old version, or NULL + if the history is missing or the record + does not exist in the view, that is, + it was freshly inserted afterwards */ + __attribute__((nonnull(1,2,3,4,5,6,7))); + +/*****************************************************************//** +Constructs the last committed version of a clustered index record, +which should be seen by a semi-consistent read. */ +UNIV_INTERN +void +row_vers_build_for_semi_consistent_read( +/*====================================*/ + const rec_t* rec, /*!< in: record in a clustered index; the + caller must have a latch on the page; this + latch locks the top of the stack of versions + of this records */ + mtr_t* mtr, /*!< in: mtr holding the latch on rec */ + dict_index_t* index, /*!< in: the clustered index */ + ulint** offsets,/*!< in/out: offsets returned by + rec_get_offsets(rec, index) */ + mem_heap_t** offset_heap,/*!< in/out: memory heap from which + the offsets are allocated */ + mem_heap_t* in_heap,/*!< in: memory heap from which the memory for + *old_vers is allocated; memory for possible + intermediate versions is allocated and freed + locally within the function */ + const rec_t** old_vers)/*!< out: rec, old version, or NULL if the + record does not exist in the view, that is, + it was freshly inserted afterwards */ + __attribute__((nonnull(1,2,3,4,5))); + + +#ifndef UNIV_NONINL +#include "row0vers.ic" +#endif + +#endif diff --git a/storage/innobase/include/row0vers.ic b/storage/innobase/include/row0vers.ic new file mode 100644 index 00000000000..ef43a55bf70 --- /dev/null +++ b/storage/innobase/include/row0vers.ic @@ -0,0 +1,30 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0vers.ic +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#include "row0row.h" +#include "dict0dict.h" +#include "read0read.h" +#include "page0page.h" +#include "log0recv.h" diff --git a/storage/innobase/include/srv0conc.h b/storage/innobase/include/srv0conc.h new file mode 100644 index 00000000000..cf61ef5528d --- /dev/null +++ b/storage/innobase/include/srv0conc.h @@ -0,0 +1,111 @@ +/***************************************************************************** + +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file srv/srv0conc.h + +InnoDB concurrency manager header file + +Created 2011/04/18 Sunny Bains +*******************************************************/ + +#ifndef srv_conc_h +#define srv_conc_h + +/** We are prepared for a situation that we have this many threads waiting for +a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the +value. */ + +extern ulint srv_max_n_threads; + +/** The following controls how many threads we let inside InnoDB concurrently: +threads waiting for locks are not counted into the number because otherwise +we could get a deadlock. Value of 0 will disable the concurrency check. */ + +extern ulong srv_thread_concurrency; + +/*********************************************************************//** +Initialise the concurrency management data structures */ +void +srv_conc_init(void); +/*===============*/ + +/*********************************************************************//** +Free the concurrency management data structures */ +void +srv_conc_free(void); +/*===============*/ + +/*********************************************************************//** +Puts an OS thread to wait if there are too many concurrent threads +(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */ +UNIV_INTERN +void +srv_conc_enter_innodb( +/*==================*/ + trx_t* trx); /*!< in: transaction object associated + with the thread */ + +/*********************************************************************//** +This lets a thread enter InnoDB regardless of the number of threads inside +InnoDB. This must be called when a thread ends a lock wait. */ +UNIV_INTERN +void +srv_conc_force_enter_innodb( +/*========================*/ + trx_t* trx); /*!< in: transaction object associated with + the thread */ + +/*********************************************************************//** +This must be called when a thread exits InnoDB in a lock wait or at the +end of an SQL statement. */ +UNIV_INTERN +void +srv_conc_force_exit_innodb( +/*=======================*/ + trx_t* trx); /*!< in: transaction object associated with + the thread */ + +/*********************************************************************//** +Get the count of threads waiting inside InnoDB. */ +UNIV_INTERN +ulint +srv_conc_get_waiting_threads(void); +/*==============================*/ + +/*********************************************************************//** +Get the count of threads active inside InnoDB. */ +UNIV_INTERN +ulint +srv_conc_get_active_threads(void); +/*==============================*/ + +#endif /* srv_conc_h */ diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h new file mode 100644 index 00000000000..e2ab81bf53a --- /dev/null +++ b/storage/innobase/include/srv0mon.h @@ -0,0 +1,896 @@ +/*********************************************************************** + +Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file include/srv0mon.h +Server monitor counter related defines + +Created 12/15/2009 Jimmy Yang +*******************************************************/ + +#ifndef srv0mon_h +#define srv0mon_h + +#include "univ.i" +#ifndef UNIV_HOTBACKUP + + +/** Possible status values for "mon_status" in "struct monitor_value" */ +enum monitor_running_status { + MONITOR_STARTED = 1, /*!< Monitor has been turned on */ + MONITOR_STOPPED = 2 /*!< Monitor has been turned off */ +}; + +typedef enum monitor_running_status monitor_running_t; + +/** Monitor counter value type */ +typedef ib_int64_t mon_type_t; + +/** Two monitor structures are defined in this file. One is +"monitor_value_t" which contains dynamic counter values for each +counter. The other is "monitor_info_t", which contains +static information (counter name, desc etc.) for each counter. +In addition, an enum datatype "monitor_id_t" is also defined, +it identifies each monitor with an internally used symbol, whose +integer value indexes into above two structure for its dynamic +and static information. +Developer who intend to add new counters would require to +fill in counter information as described in "monitor_info_t" and +create the internal counter ID in "monitor_id_t". */ + +/** Structure containing the actual values of a monitor counter. */ +struct monitor_value_t { + ib_time_t mon_start_time; /*!< Start time of monitoring */ + ib_time_t mon_stop_time; /*!< Stop time of monitoring */ + ib_time_t mon_reset_time; /*!< Time counter resetted */ + mon_type_t mon_value; /*!< Current counter Value */ + mon_type_t mon_max_value; /*!< Current Max value */ + mon_type_t mon_min_value; /*!< Current Min value */ + mon_type_t mon_value_reset;/*!< value at last reset */ + mon_type_t mon_max_value_start; /*!< Max value since start */ + mon_type_t mon_min_value_start; /*!< Min value since start */ + mon_type_t mon_start_value;/*!< Value at the start time */ + mon_type_t mon_last_value; /*!< Last set of values */ + monitor_running_t mon_status; /* whether monitor still running */ +}; + +/** Follwoing defines are possible values for "monitor_type" field in +"struct monitor_info" */ +enum monitor_type_t { + MONITOR_NONE = 0, /*!< No monitoring */ + MONITOR_MODULE = 1, /*!< This is a monitor module type, + not a counter */ + MONITOR_EXISTING = 2, /*!< The monitor carries information from + an existing system status variable */ + MONITOR_NO_AVERAGE = 4, /*!< Set this status if we don't want to + calculate the average value for the counter */ + MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the + counter, rather than incremental value + over the period. Mostly for counters + displaying current resource usage */ + MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off + only as a module, but not individually */ + MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at + server start up */ + MONITOR_SET_OWNER = 64, /*!< Owner of "monitor set", a set of + monitor counters */ + MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */ + MONITOR_HIDDEN = 256 /*!< Do not display this monitor in the + metrics table */ +}; + +/** Counter minimum value is initialized to be max value of + mon_type_t (ib_int64_t) */ +#define MIN_RESERVED ((mon_type_t) (IB_UINT64_MAX >> 1)) +#define MAX_RESERVED (~MIN_RESERVED) + +/** This enumeration defines internal monitor identifier used internally +to identify each particular counter. Its value indexes into two arrays, +one is the "innodb_counter_value" array which records actual monitor +counter values, the other is "innodb_counter_info" array which describes +each counter's basic information (name, desc etc.). A couple of +naming rules here: +1) If the monitor defines a module, it starts with MONITOR_MODULE +2) If the monitor uses exisitng counters from "status variable", its ID +name shall start with MONITOR_OVLD + +Please refer to "innodb_counter_info" in srv/srv0mon.cc for detail +information for each monitor counter */ + +enum monitor_id_t { + /* This is to identify the default value set by the metrics + control global variables */ + MONITOR_DEFAULT_START = 0, + + /* Start of Metadata counter */ + MONITOR_MODULE_METADATA, + MONITOR_TABLE_OPEN, + MONITOR_TABLE_CLOSE, + MONITOR_TABLE_REFERENCE, + MONITOR_OVLD_META_MEM_POOL, + + /* Lock manager related counters */ + MONITOR_MODULE_LOCK, + MONITOR_DEADLOCK, + MONITOR_TIMEOUT, + MONITOR_LOCKREC_WAIT, + MONITOR_TABLELOCK_WAIT, + MONITOR_NUM_RECLOCK_REQ, + MONITOR_RECLOCK_CREATED, + MONITOR_RECLOCK_REMOVED, + MONITOR_NUM_RECLOCK, + MONITOR_TABLELOCK_CREATED, + MONITOR_TABLELOCK_REMOVED, + MONITOR_NUM_TABLELOCK, + MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT, + MONITOR_OVLD_LOCK_WAIT_TIME, + MONITOR_OVLD_LOCK_MAX_WAIT_TIME, + MONITOR_OVLD_ROW_LOCK_WAIT, + MONITOR_OVLD_LOCK_AVG_WAIT_TIME, + + /* Buffer and I/O realted counters. */ + MONITOR_MODULE_BUFFER, + MONITOR_OVLD_BUFFER_POOL_SIZE, + MONITOR_OVLD_BUF_POOL_READS, + MONITOR_OVLD_BUF_POOL_READ_REQUESTS, + MONITOR_OVLD_BUF_POOL_WRITE_REQUEST, + MONITOR_OVLD_BUF_POOL_WAIT_FREE, + MONITOR_OVLD_BUF_POOL_READ_AHEAD, + MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED, + MONITOR_OVLD_BUF_POOL_PAGE_TOTAL, + MONITOR_OVLD_BUF_POOL_PAGE_MISC, + MONITOR_OVLD_BUF_POOL_PAGES_DATA, + MONITOR_OVLD_BUF_POOL_BYTES_DATA, + MONITOR_OVLD_BUF_POOL_PAGES_DIRTY, + MONITOR_OVLD_BUF_POOL_BYTES_DIRTY, + MONITOR_OVLD_BUF_POOL_PAGES_FREE, + MONITOR_OVLD_PAGE_CREATED, + MONITOR_OVLD_PAGES_WRITTEN, + MONITOR_OVLD_PAGES_READ, + MONITOR_OVLD_BYTE_READ, + MONITOR_OVLD_BYTE_WRITTEN, + MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, + MONITOR_FLUSH_HP_RESCAN, + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT, + MONITOR_FLUSH_NEIGHBOR_PAGES, + MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, + MONITOR_FLUSH_AVG_PAGE_RATE, + MONITOR_FLUSH_LSN_AVG_RATE, + MONITOR_FLUSH_PCT_FOR_DIRTY, + MONITOR_FLUSH_PCT_FOR_LSN, + MONITOR_FLUSH_SYNC_WAITS, + MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, + MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_PAGES, + MONITOR_LRU_SINGLE_FLUSH_SCANNED, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, + MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT, + MONITOR_LRU_GET_FREE_SEARCH, + MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_SEARCH_SCANNED_PER_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + + /* Buffer Page I/O specific counters. */ + MONITOR_MODULE_BUF_PAGE, + MONITOR_INDEX_LEAF_PAGE_READ, + MONITOR_INDEX_NON_LEAF_PAGE_READ, + MONITOR_INDEX_IBUF_LEAF_PAGE_READ, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ, + MONITOR_UNDO_LOG_PAGE_READ, + MONITOR_INODE_PAGE_READ, + MONITOR_IBUF_FREELIST_PAGE_READ, + MONITOR_IBUF_BITMAP_PAGE_READ, + MONITOR_SYSTEM_PAGE_READ, + MONITOR_TRX_SYSTEM_PAGE_READ, + MONITOR_FSP_HDR_PAGE_READ, + MONITOR_XDES_PAGE_READ, + MONITOR_BLOB_PAGE_READ, + MONITOR_ZBLOB_PAGE_READ, + MONITOR_ZBLOB2_PAGE_READ, + MONITOR_OTHER_PAGE_READ, + MONITOR_INDEX_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN, + MONITOR_UNDO_LOG_PAGE_WRITTEN, + MONITOR_INODE_PAGE_WRITTEN, + MONITOR_IBUF_FREELIST_PAGE_WRITTEN, + MONITOR_IBUF_BITMAP_PAGE_WRITTEN, + MONITOR_SYSTEM_PAGE_WRITTEN, + MONITOR_TRX_SYSTEM_PAGE_WRITTEN, + MONITOR_FSP_HDR_PAGE_WRITTEN, + MONITOR_XDES_PAGE_WRITTEN, + MONITOR_BLOB_PAGE_WRITTEN, + MONITOR_ZBLOB_PAGE_WRITTEN, + MONITOR_ZBLOB2_PAGE_WRITTEN, + MONITOR_OTHER_PAGE_WRITTEN, + + /* OS level counters (I/O) */ + MONITOR_MODULE_OS, + MONITOR_OVLD_OS_FILE_READ, + MONITOR_OVLD_OS_FILE_WRITE, + MONITOR_OVLD_OS_FSYNC, + MONITOR_OS_PENDING_READS, + MONITOR_OS_PENDING_WRITES, + MONITOR_OVLD_OS_LOG_WRITTEN, + MONITOR_OVLD_OS_LOG_FSYNC, + MONITOR_OVLD_OS_LOG_PENDING_FSYNC, + MONITOR_OVLD_OS_LOG_PENDING_WRITES, + + /* Transaction related counters */ + MONITOR_MODULE_TRX, + MONITOR_TRX_RW_COMMIT, + MONITOR_TRX_RO_COMMIT, + MONITOR_TRX_NL_RO_COMMIT, + MONITOR_TRX_COMMIT_UNDO, + MONITOR_TRX_ROLLBACK, + MONITOR_TRX_ROLLBACK_SAVEPOINT, + MONITOR_TRX_ROLLBACK_ACTIVE, + MONITOR_TRX_ACTIVE, + MONITOR_RSEG_HISTORY_LEN, + MONITOR_NUM_UNDO_SLOT_USED, + MONITOR_NUM_UNDO_SLOT_CACHED, + MONITOR_RSEG_CUR_SIZE, + + /* Purge related counters */ + MONITOR_MODULE_PURGE, + MONITOR_N_DEL_ROW_PURGE, + MONITOR_N_UPD_EXIST_EXTERN, + MONITOR_PURGE_INVOKED, + MONITOR_PURGE_N_PAGE_HANDLED, + MONITOR_DML_PURGE_DELAY, + MONITOR_PURGE_STOP_COUNT, + MONITOR_PURGE_RESUME_COUNT, + + /* Recovery related counters */ + MONITOR_MODULE_RECOVERY, + MONITOR_NUM_CHECKPOINT, + MONITOR_OVLD_LSN_FLUSHDISK, + MONITOR_OVLD_LSN_CHECKPOINT, + MONITOR_OVLD_LSN_CURRENT, + MONITOR_LSN_CHECKPOINT_AGE, + MONITOR_OVLD_BUF_OLDEST_LSN, + MONITOR_OVLD_MAX_AGE_ASYNC, + MONITOR_OVLD_MAX_AGE_SYNC, + MONITOR_PENDING_LOG_WRITE, + MONITOR_PENDING_CHECKPOINT_WRITE, + MONITOR_LOG_IO, + MONITOR_OVLD_LOG_WAITS, + MONITOR_OVLD_LOG_WRITE_REQUEST, + MONITOR_OVLD_LOG_WRITES, + + /* Page Manager related counters */ + MONITOR_MODULE_PAGE, + MONITOR_PAGE_COMPRESS, + MONITOR_PAGE_DECOMPRESS, + MONITOR_PAD_INCREMENTS, + MONITOR_PAD_DECREMENTS, + + /* Index related counters */ + MONITOR_MODULE_INDEX, + MONITOR_INDEX_SPLIT, + MONITOR_INDEX_MERGE_ATTEMPTS, + MONITOR_INDEX_MERGE_SUCCESSFUL, + MONITOR_INDEX_REORG_ATTEMPTS, + MONITOR_INDEX_REORG_SUCCESSFUL, + MONITOR_INDEX_DISCARD, + + /* Adaptive Hash Index related counters */ + MONITOR_MODULE_ADAPTIVE_HASH, + MONITOR_OVLD_ADAPTIVE_HASH_SEARCH, + MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE, + MONITOR_ADAPTIVE_HASH_PAGE_ADDED, + MONITOR_ADAPTIVE_HASH_PAGE_REMOVED, + MONITOR_ADAPTIVE_HASH_ROW_ADDED, + MONITOR_ADAPTIVE_HASH_ROW_REMOVED, + MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND, + MONITOR_ADAPTIVE_HASH_ROW_UPDATED, + + /* Tablespace related counters */ + MONITOR_MODULE_FIL_SYSTEM, + MONITOR_OVLD_N_FILE_OPENED, + + /* InnoDB Change Buffer related counters */ + MONITOR_MODULE_IBUF_SYSTEM, + MONITOR_OVLD_IBUF_MERGE_INSERT, + MONITOR_OVLD_IBUF_MERGE_DELETE, + MONITOR_OVLD_IBUF_MERGE_PURGE, + MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT, + MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE, + MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE, + MONITOR_OVLD_IBUF_MERGES, + MONITOR_OVLD_IBUF_SIZE, + + /* Counters for server operations */ + MONITOR_MODULE_SERVER, + MONITOR_MASTER_THREAD_SLEEP, + MONITOR_OVLD_SERVER_ACTIVITY, + MONITOR_MASTER_ACTIVE_LOOPS, + MONITOR_MASTER_IDLE_LOOPS, + MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND, + MONITOR_SRV_IBUF_MERGE_MICROSECOND, + MONITOR_SRV_LOG_FLUSH_MICROSECOND, + MONITOR_SRV_MEM_VALIDATE_MICROSECOND, + MONITOR_SRV_PURGE_MICROSECOND, + MONITOR_SRV_DICT_LRU_MICROSECOND, + MONITOR_SRV_CHECKPOINT_MICROSECOND, + MONITOR_OVLD_SRV_DBLWR_WRITES, + MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN, + MONITOR_OVLD_SRV_PAGE_SIZE, + MONITOR_OVLD_RWLOCK_S_SPIN_WAITS, + MONITOR_OVLD_RWLOCK_X_SPIN_WAITS, + MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS, + MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS, + MONITOR_OVLD_RWLOCK_S_OS_WAITS, + MONITOR_OVLD_RWLOCK_X_OS_WAITS, + + /* Data DML related counters */ + MONITOR_MODULE_DML_STATS, + MONITOR_OLVD_ROW_READ, + MONITOR_OLVD_ROW_INSERTED, + MONITOR_OLVD_ROW_DELETED, + MONITOR_OLVD_ROW_UPDTATED, + + /* Data DDL related counters */ + MONITOR_MODULE_DDL_STATS, + MONITOR_BACKGROUND_DROP_INDEX, + MONITOR_BACKGROUND_DROP_TABLE, + MONITOR_ONLINE_CREATE_INDEX, + MONITOR_PENDING_ALTER_TABLE, + + MONITOR_MODULE_ICP, + MONITOR_ICP_ATTEMPTS, + MONITOR_ICP_NO_MATCH, + MONITOR_ICP_OUT_OF_RANGE, + MONITOR_ICP_MATCH, + + /* This is used only for control system to turn + on/off and reset all monitor counters */ + MONITOR_ALL_COUNTER, + + /* This must be the last member */ + NUM_MONITOR +}; + +/** This informs the monitor control system to turn +on/off and reset monitor counters through wild card match */ +#define MONITOR_WILDCARD_MATCH (NUM_MONITOR + 1) + +/** Cannot find monitor counter with a specified name */ +#define MONITOR_NO_MATCH (NUM_MONITOR + 2) + +/** struct monitor_info describes the basic/static information +about each monitor counter. */ +struct monitor_info_t { + const char* monitor_name; /*!< Monitor name */ + const char* monitor_module; /*!< Sub Module the monitor + belongs to */ + const char* monitor_desc; /*!< Brief desc of monitor counter */ + monitor_type_t monitor_type; /*!< Type of Monitor Info */ + monitor_id_t monitor_related_id;/*!< Monitor ID of counter that + related to this monitor. This is + set when the monitor belongs to + a "monitor set" */ + monitor_id_t monitor_id; /*!< Monitor ID as defined in enum + monitor_id_t */ +}; + +/** Following are the "set_option" values allowed for +srv_mon_process_existing_counter() and srv_mon_process_existing_counter() +functions. To turn on/off/reset the monitor counters. */ +enum mon_option_t { + MONITOR_TURN_ON = 1, /*!< Turn on the counter */ + MONITOR_TURN_OFF, /*!< Turn off the counter */ + MONITOR_RESET_VALUE, /*!< Reset current values */ + MONITOR_RESET_ALL_VALUE, /*!< Reset all values */ + MONITOR_GET_VALUE /*!< Option for + srv_mon_process_existing_counter() + function */ +}; + +/** Number of bit in a ulint datatype */ +#define NUM_BITS_ULINT (sizeof(ulint) * CHAR_BIT) + +/** This "monitor_set_tbl" is a bitmap records whether a particular monitor +counter has been turned on or off */ +extern ulint monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / + NUM_BITS_ULINT]; + +/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor +counter option. */ +#define MONITOR_ON(monitor) \ + (monitor_set_tbl[monitor / NUM_BITS_ULINT] |= \ + ((ulint)1 << (monitor % NUM_BITS_ULINT))) + +#define MONITOR_OFF(monitor) \ + (monitor_set_tbl[monitor / NUM_BITS_ULINT] &= \ + ~((ulint)1 << (monitor % NUM_BITS_ULINT))) + +/** Check whether the requested monitor is turned on/off */ +#define MONITOR_IS_ON(monitor) \ + (monitor_set_tbl[monitor / NUM_BITS_ULINT] & \ + ((ulint)1 << (monitor % NUM_BITS_ULINT))) + +/** The actual monitor counter array that records each monintor counter +value */ +extern monitor_value_t innodb_counter_value[NUM_MONITOR]; + +/** Following are macro defines for basic montior counter manipulations. +Please note we do not provide any synchronization for these monitor +operations due to performance consideration. Most counters can +be placed under existing mutex protections in respective code +module. */ + +/** Macros to access various fields of a monitor counters */ +#define MONITOR_FIELD(monitor, field) \ + (innodb_counter_value[monitor].field) + +#define MONITOR_VALUE(monitor) \ + MONITOR_FIELD(monitor, mon_value) + +#define MONITOR_MAX_VALUE(monitor) \ + MONITOR_FIELD(monitor, mon_max_value) + +#define MONITOR_MIN_VALUE(monitor) \ + MONITOR_FIELD(monitor, mon_min_value) + +#define MONITOR_VALUE_RESET(monitor) \ + MONITOR_FIELD(monitor, mon_value_reset) + +#define MONITOR_MAX_VALUE_START(monitor) \ + MONITOR_FIELD(monitor, mon_max_value_start) + +#define MONITOR_MIN_VALUE_START(monitor) \ + MONITOR_FIELD(monitor, mon_min_value_start) + +#define MONITOR_LAST_VALUE(monitor) \ + MONITOR_FIELD(monitor, mon_last_value) + +#define MONITOR_START_VALUE(monitor) \ + MONITOR_FIELD(monitor, mon_start_value) + +#define MONITOR_VALUE_SINCE_START(monitor) \ + (MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor)) + +#define MONITOR_STATUS(monitor) \ + MONITOR_FIELD(monitor, mon_status) + +#define MONITOR_SET_START(monitor) \ + do { \ + MONITOR_STATUS(monitor) = MONITOR_STARTED; \ + MONITOR_FIELD((monitor), mon_start_time) = time(NULL); \ + } while (0) + +#define MONITOR_SET_OFF(monitor) \ + do { \ + MONITOR_STATUS(monitor) = MONITOR_STOPPED; \ + MONITOR_FIELD((monitor), mon_stop_time) = time(NULL); \ + } while (0) + +#define MONITOR_INIT_ZERO_VALUE 0 + +/** Max and min values are initialized when we first turn on the monitor +counter, and set the MONITOR_STATUS. */ +#define MONITOR_MAX_MIN_NOT_INIT(monitor) \ + (MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE \ + && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE \ + && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE) + +#define MONITOR_INIT(monitor) \ + if (MONITOR_MAX_MIN_NOT_INIT(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MIN_RESERVED; \ + MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED; \ + MONITOR_MAX_VALUE(monitor) = MAX_RESERVED; \ + MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED; \ + } + +/** Macros to increment/decrement the counters. The normal +monitor counter operation expects appropriate synchronization +already exists. No additional mutex is necessary when operating +on the counters */ +#define MONITOR_INC(monitor) \ + if (MONITOR_IS_ON(monitor)) { \ + MONITOR_VALUE(monitor)++; \ + if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +/** Increment a monitor counter under mutex protection. +Use MONITOR_INC if appropriate mutex protection already exists. +@param monitor monitor to be incremented by 1 +@param mutex mutex to acquire and relese */ +# define MONITOR_MUTEX_INC(mutex, monitor) \ + ut_ad(!mutex_own(mutex)); \ + if (MONITOR_IS_ON(monitor)) { \ + mutex_enter(mutex); \ + if (++MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor); \ + } \ + mutex_exit(mutex); \ + } +/** Decrement a monitor counter under mutex protection. +Use MONITOR_DEC if appropriate mutex protection already exists. +@param monitor monitor to be decremented by 1 +@param mutex mutex to acquire and relese */ +# define MONITOR_MUTEX_DEC(mutex, monitor) \ + ut_ad(!mutex_own(mutex)); \ + if (MONITOR_IS_ON(monitor)) { \ + mutex_enter(mutex); \ + if (--MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor); \ + } \ + mutex_exit(mutex); \ + } + +#if defined HAVE_ATOMIC_BUILTINS_64 +/** Atomically increment a monitor counter. +Use MONITOR_INC if appropriate mutex protection exists. +@param monitor monitor to be incremented by 1 */ +# define MONITOR_ATOMIC_INC(monitor) \ + if (MONITOR_IS_ON(monitor)) { \ + ib_uint64_t value; \ + value = os_atomic_increment_uint64( \ + (ib_uint64_t*) &MONITOR_VALUE(monitor), 1); \ + /* Note: This is not 100% accurate because of the \ + inherent race, we ignore it due to performance. */ \ + if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = value; \ + } \ + } + +/** Atomically decrement a monitor counter. +Use MONITOR_DEC if appropriate mutex protection exists. +@param monitor monitor to be decremented by 1 */ +# define MONITOR_ATOMIC_DEC(monitor) \ + if (MONITOR_IS_ON(monitor)) { \ + ib_uint64_t value; \ + value = os_atomic_decrement_uint64( \ + (ib_uint64_t*) &MONITOR_VALUE(monitor), 1); \ + /* Note: This is not 100% accurate because of the \ + inherent race, we ignore it due to performance. */ \ + if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = value; \ + } \ + } +# define srv_mon_create() ((void) 0) +# define srv_mon_free() ((void) 0) +#else /* HAVE_ATOMIC_BUILTINS_64 */ +/** Mutex protecting atomic operations on platforms that lack +built-in operations for atomic memory access */ +extern ib_mutex_t monitor_mutex; +/****************************************************************//** +Initialize the monitor subsystem. */ +UNIV_INTERN +void +srv_mon_create(void); +/*================*/ +/****************************************************************//** +Close the monitor subsystem. */ +UNIV_INTERN +void +srv_mon_free(void); +/*==============*/ + +/** Atomically increment a monitor counter. +Use MONITOR_INC if appropriate mutex protection exists. +@param monitor monitor to be incremented by 1 */ +# define MONITOR_ATOMIC_INC(monitor) MONITOR_MUTEX_INC(&monitor_mutex, monitor) +/** Atomically decrement a monitor counter. +Use MONITOR_DEC if appropriate mutex protection exists. +@param monitor monitor to be decremented by 1 */ +# define MONITOR_ATOMIC_DEC(monitor) MONITOR_MUTEX_DEC(&monitor_mutex, monitor) +#endif /* HAVE_ATOMIC_BUILTINS_64 */ + +#define MONITOR_DEC(monitor) \ + if (MONITOR_IS_ON(monitor)) { \ + MONITOR_VALUE(monitor)--; \ + if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +#ifdef UNIV_DEBUG_VALGRIND +# define MONITOR_CHECK_DEFINED(value) do { \ + mon_type_t m = value; \ + UNIV_MEM_ASSERT_RW(&m, sizeof m); \ +} while (0) +#else /* UNIV_DEBUG_VALGRIND */ +# define MONITOR_CHECK_DEFINED(value) (void) 0 +#endif /* UNIV_DEBUG_VALGRIND */ + +#define MONITOR_INC_VALUE(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + MONITOR_VALUE(monitor) += (mon_type_t) (value); \ + if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +#define MONITOR_DEC_VALUE(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value); \ + MONITOR_VALUE(monitor) -= (mon_type_t) (value); \ + if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +/* Increment/decrement counter without check the monitor on/off bit, which +could already be checked as a module group */ +#define MONITOR_INC_NOCHECK(monitor) \ + do { \ + MONITOR_VALUE(monitor)++; \ + if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } while (0) \ + +#define MONITOR_DEC_NOCHECK(monitor) \ + do { \ + MONITOR_VALUE(monitor)--; \ + if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } while (0) + +/** Directly set a monitor counter's value */ +#define MONITOR_SET(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + MONITOR_VALUE(monitor) = (mon_type_t) (value); \ + if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \ + MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +/** Add time difference between now and input "value" (in seconds) to the +monitor counter +@param monitor monitor to update for the time difference +@param value the start time value */ +#define MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + ullint old_time = (value); \ + value = ut_time_us(NULL); \ + MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\ + } + +/** This macro updates 3 counters in one call. However, it only checks the +main/first monitor counter 'monitor', to see it is on or off to decide +whether to do the update. +@param monitor the main monitor counter to update. It accounts for + the accumulative value for the counter. +@param monitor_n_calls counter that counts number of times this macro is + called +@param monitor_per_call counter that records the current and max value of + each incremental value +@param value incremental value to record this time */ +#define MONITOR_INC_VALUE_CUMULATIVE( \ + monitor, monitor_n_calls, monitor_per_call, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + MONITOR_VALUE(monitor_n_calls)++; \ + MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value); \ + if (MONITOR_VALUE(monitor_per_call) \ + > MONITOR_MAX_VALUE(monitor_per_call)) { \ + MONITOR_MAX_VALUE(monitor_per_call) = \ + (mon_type_t) (value); \ + } \ + MONITOR_VALUE(monitor) += (mon_type_t) (value); \ + if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +/** Directly set a monitor counter's value, and if the value +is monotonically increasing, only max value needs to be updated */ +#define MONITOR_SET_UPD_MAX_ONLY(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + MONITOR_VALUE(monitor) = (mon_type_t) (value); \ + if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \ + MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\ + } \ + } + +/** Some values such as log sequence number are montomically increasing +number, do not need to record max/min values */ +#define MONITOR_SET_SIMPLE(monitor, value) \ + MONITOR_CHECK_DEFINED(value); \ + if (MONITOR_IS_ON(monitor)) { \ + MONITOR_VALUE(monitor) = (mon_type_t) (value); \ + } + +/** Reset the monitor value and max/min value to zero. The reset +operation would only be conducted when the counter is turned off */ +#define MONITOR_RESET_ALL(monitor) \ + do { \ + MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE; \ + MONITOR_MAX_VALUE(monitor) = MAX_RESERVED; \ + MONITOR_MIN_VALUE(monitor) = MIN_RESERVED; \ + MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE; \ + MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED; \ + MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED; \ + MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE; \ + MONITOR_FIELD(monitor, mon_start_time) = \ + MONITOR_INIT_ZERO_VALUE; \ + MONITOR_FIELD(monitor, mon_stop_time) = \ + MONITOR_INIT_ZERO_VALUE; \ + MONITOR_FIELD(monitor, mon_reset_time) = \ + MONITOR_INIT_ZERO_VALUE; \ + } while (0) + +/** Following four macros defines necessary operations to fetch and +consolidate information from existing system status variables. */ + +/** Save the passed-in value to mon_start_value field of monitor +counters */ +#define MONITOR_SAVE_START(monitor, value) do { \ + MONITOR_CHECK_DEFINED(value); \ + (MONITOR_START_VALUE(monitor) = \ + (mon_type_t) (value) - MONITOR_VALUE_RESET(monitor)); \ + } while (0) + +/** Save the passed-in value to mon_last_value field of monitor +counters */ +#define MONITOR_SAVE_LAST(monitor) \ + do { \ + MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor); \ + MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor); \ + } while (0) + +/** Set monitor value to the difference of value and mon_start_value +compensated by mon_last_value if accumulated value is required. */ +#define MONITOR_SET_DIFF(monitor, value) \ + MONITOR_SET_UPD_MAX_ONLY(monitor, ((value) \ + - MONITOR_VALUE_RESET(monitor) \ + - MONITOR_FIELD(monitor, mon_start_value) \ + + MONITOR_FIELD(monitor, mon_last_value))) + +/****************************************************************//** +Get monitor's monitor_info_t by its monitor id (index into the +innodb_counter_info array +@return Point to corresponding monitor_info_t, or NULL if no such +monitor */ +UNIV_INTERN +monitor_info_t* +srv_mon_get_info( +/*=============*/ + monitor_id_t monitor_id); /*!< id index into the + innodb_counter_info array */ +/****************************************************************//** +Get monitor's name by its monitor id (index into the +innodb_counter_info array +@return corresponding monitor name, or NULL if no such +monitor */ +UNIV_INTERN +const char* +srv_mon_get_name( +/*=============*/ + monitor_id_t monitor_id); /*!< id index into the + innodb_counter_info array */ + +/****************************************************************//** +Turn on/off/reset monitor counters in a module. If module_value +is NUM_MONITOR then turn on all monitor counters. +@return 0 if successful, or the first monitor that cannot be +turned on because it is already turned on. */ +UNIV_INTERN +void +srv_mon_set_module_control( +/*=======================*/ + monitor_id_t module_id, /*!< in: Module ID as in + monitor_counter_id. If it is + set to NUM_MONITOR, this means + we shall turn on all the counters */ + mon_option_t set_option); /*!< in: Turn on/off reset the + counter */ +/****************************************************************//** +This function consolidates some existing server counters used +by "system status variables". These existing system variables do not have +mechanism to start/stop and reset the counters, so we simulate these +controls by remembering the corresponding counter values when the +corresponding monitors are turned on/off/reset, and do appropriate +mathematics to deduct the actual value. */ +UNIV_INTERN +void +srv_mon_process_existing_counter( +/*=============================*/ + monitor_id_t monitor_id, /*!< in: the monitor's ID as in + monitor_counter_id */ + mon_option_t set_option); /*!< in: Turn on/off reset the + counter */ +/*************************************************************//** +This function is used to calculate the maximum counter value +since the start of monitor counter +@return max counter value since start. */ +UNIV_INLINE +mon_type_t +srv_mon_calc_max_since_start( +/*=========================*/ + monitor_id_t monitor); /*!< in: monitor id */ +/*************************************************************//** +This function is used to calculate the minimum counter value +since the start of monitor counter +@return min counter value since start. */ +UNIV_INLINE +mon_type_t +srv_mon_calc_min_since_start( +/*=========================*/ + monitor_id_t monitor); /*!< in: monitor id*/ +/*************************************************************//** +Reset a monitor, create a new base line with the current monitor +value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */ +UNIV_INTERN +void +srv_mon_reset( +/*==========*/ + monitor_id_t monitor); /*!< in: monitor id*/ +/*************************************************************//** +This function resets all values of a monitor counter */ +UNIV_INLINE +void +srv_mon_reset_all( +/*==============*/ + monitor_id_t monitor); /*!< in: monitor id*/ +/*************************************************************//** +Turn on monitor counters that are marked as default ON. */ +UNIV_INTERN +void +srv_mon_default_on(void); +/*====================*/ + +#ifndef UNIV_NONINL +#include "srv0mon.ic" +#endif +#else /* !UNIV_HOTBACKUP */ +# define MONITOR_INC(x) ((void) 0) +# define MONITOR_DEC(x) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/innobase/include/srv0mon.ic b/storage/innobase/include/srv0mon.ic new file mode 100644 index 00000000000..225390c6b6f --- /dev/null +++ b/storage/innobase/include/srv0mon.ic @@ -0,0 +1,113 @@ +/***************************************************************************** + +Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/srv0mon.ic +Server monitoring system + +Created 1/20/2010 Jimmy Yang +************************************************************************/ + +/*************************************************************//** +This function is used to calculate the maximum counter value +since the start of monitor counter +@return max counter value since start. */ +UNIV_INLINE +mon_type_t +srv_mon_calc_max_since_start( +/*=========================*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) { + + /* MONITOR_MAX_VALUE_START has not yet been + initialized, the max value since start is the + max count in MONITOR_MAX_VALUE */ + MONITOR_MAX_VALUE_START(monitor) = + MONITOR_MAX_VALUE(monitor); + + } else if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED + && (MONITOR_MAX_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor) + > MONITOR_MAX_VALUE_START(monitor))) { + + /* If the max value since reset (as specified + in MONITOR_MAX_VALUE) plus the reset value is + larger than MONITOR_MAX_VALUE_START, reset + MONITOR_MAX_VALUE_START to this new max value */ + MONITOR_MAX_VALUE_START(monitor) = + MONITOR_MAX_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor); + } + + return(MONITOR_MAX_VALUE_START(monitor)); +} + +/*************************************************************//** +This function is used to calculate the minimum counter value +since the start of monitor counter +@return min counter value since start. */ +UNIV_INLINE +mon_type_t +srv_mon_calc_min_since_start( +/*=========================*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) { + + /* MONITOR_MIN_VALUE_START has not yet been + initialized, the min value since start is the + min count in MONITOR_MIN_VALUE */ + MONITOR_MIN_VALUE_START(monitor) = + MONITOR_MIN_VALUE(monitor); + + } else if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED + && (MONITOR_MIN_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor) + < MONITOR_MIN_VALUE_START(monitor))) { + + /* If the min value since reset (as specified + in MONITOR_MIN_VALUE) plus the reset value is + less than MONITOR_MIN_VALUE_START, reset + MONITOR_MIN_VALUE_START to this new min value */ + MONITOR_MIN_VALUE_START(monitor) = + MONITOR_MIN_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor); + } + + return(MONITOR_MIN_VALUE_START(monitor)); +} + +/*************************************************************//** +This function resets all values of a monitor counter */ +UNIV_INLINE +void +srv_mon_reset_all( +/*==============*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + /* Do not reset all counter values if monitor is still on. */ + if (MONITOR_IS_ON(monitor)) { + fprintf(stderr, "InnoDB: Cannot reset all values for " + "monitor counter %s while it is on. Please " + "turn it off and retry. \n", + srv_mon_get_name(monitor)); + } else { + MONITOR_RESET_ALL(monitor); + } +} diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h new file mode 100644 index 00000000000..7a6c9f93e3d --- /dev/null +++ b/storage/innobase/include/srv0srv.h @@ -0,0 +1,888 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2008, 2009, Google Inc. +Copyright (c) 2009, Percona Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/srv0srv.h +The server main program + +Created 10/10/1995 Heikki Tuuri +*******************************************************/ + +#ifndef srv0srv_h +#define srv0srv_h + +#include "univ.i" +#ifndef UNIV_HOTBACKUP +#include "log0log.h" +#include "sync0sync.h" +#include "os0sync.h" +#include "que0types.h" +#include "trx0types.h" +#include "srv0conc.h" +#include "buf0checksum.h" +#include "ut0counter.h" + +/* Global counters used inside InnoDB. */ +struct srv_stats_t { + typedef ib_counter_t<lsn_t, 1, single_indexer_t> lsn_ctr_1_t; + typedef ib_counter_t<ulint, 1, single_indexer_t> ulint_ctr_1_t; + typedef ib_counter_t<lint, 1, single_indexer_t> lint_ctr_1_t; + typedef ib_counter_t<ulint, 64> ulint_ctr_64_t; + typedef ib_counter_t<ib_int64_t, 1, single_indexer_t> ib_int64_ctr_1_t; + + /** Count the amount of data written in total (in bytes) */ + ulint_ctr_1_t data_written; + + /** Number of the log write requests done */ + ulint_ctr_1_t log_write_requests; + + /** Number of physical writes to the log performed */ + ulint_ctr_1_t log_writes; + + /** Amount of data written to the log files in bytes */ + lsn_ctr_1_t os_log_written; + + /** Number of writes being done to the log files */ + lint_ctr_1_t os_log_pending_writes; + + /** We increase this counter, when we don't have enough + space in the log buffer and have to flush it */ + ulint_ctr_1_t log_waits; + + /** Count the number of times the doublewrite buffer was flushed */ + ulint_ctr_1_t dblwr_writes; + + /** Store the number of pages that have been flushed to the + doublewrite buffer */ + ulint_ctr_1_t dblwr_pages_written; + + /** Store the number of write requests issued */ + ulint_ctr_1_t buf_pool_write_requests; + + /** Store the number of times when we had to wait for a free page + in the buffer pool. It happens when the buffer pool is full and we + need to make a flush, in order to be able to read or create a page. */ + ulint_ctr_1_t buf_pool_wait_free; + + /** Count the number of pages that were written from buffer + pool to the disk */ + ulint_ctr_1_t buf_pool_flushed; + + /** Number of buffer pool reads that led to the reading of + a disk page */ + ulint_ctr_1_t buf_pool_reads; + + /** Number of data read in total (in bytes) */ + ulint_ctr_1_t data_read; + + /** Wait time of database locks */ + ib_int64_ctr_1_t n_lock_wait_time; + + /** Number of database lock waits */ + ulint_ctr_1_t n_lock_wait_count; + + /** Number of threads currently waiting on database locks */ + lint_ctr_1_t n_lock_wait_current_count; + + /** Number of rows read. */ + ulint_ctr_64_t n_rows_read; + + /** Number of rows updated */ + ulint_ctr_64_t n_rows_updated; + + /** Number of rows deleted */ + ulint_ctr_64_t n_rows_deleted; + + /** Number of rows inserted */ + ulint_ctr_64_t n_rows_inserted; +}; + +extern const char* srv_main_thread_op_info; + +/** Prefix used by MySQL to indicate pre-5.1 table name encoding */ +extern const char srv_mysql50_table_name_prefix[10]; + +/* The monitor thread waits on this event. */ +extern os_event_t srv_monitor_event; + +/* The error monitor thread waits on this event. */ +extern os_event_t srv_error_event; + +/** The buffer pool dump/load thread waits on this event. */ +extern os_event_t srv_buf_dump_event; + +/** The buffer pool dump/load file name */ +#define SRV_BUF_DUMP_FILENAME_DEFAULT "ib_buffer_pool" +extern char* srv_buf_dump_filename; + +/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown +and/or load it during startup. */ +extern char srv_buffer_pool_dump_at_shutdown; +extern char srv_buffer_pool_load_at_startup; + +/* Whether to disable file system cache if it is defined */ +extern char srv_disable_sort_file_cache; + +/* If the last data file is auto-extended, we add this many pages to it +at a time */ +#define SRV_AUTO_EXTEND_INCREMENT \ + (srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE)) + +/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */ +extern ib_mutex_t srv_monitor_file_mutex; +/* Temporary file for innodb monitor output */ +extern FILE* srv_monitor_file; +/* Mutex for locking srv_dict_tmpfile. Only created if !srv_read_only_mode. +This mutex has a very high rank; threads reserving it should not +be holding any InnoDB latches. */ +extern ib_mutex_t srv_dict_tmpfile_mutex; +/* Temporary file for output from the data dictionary */ +extern FILE* srv_dict_tmpfile; +/* Mutex for locking srv_misc_tmpfile. Only created if !srv_read_only_mode. +This mutex has a very low rank; threads reserving it should not +acquire any further latches or sleep before releasing this one. */ +extern ib_mutex_t srv_misc_tmpfile_mutex; +/* Temporary file for miscellanous diagnostic output */ +extern FILE* srv_misc_tmpfile; + +/* Server parameters which are read from the initfile */ + +extern char* srv_data_home; + +#ifdef UNIV_LOG_ARCHIVE +extern char* srv_arch_dir; +#endif /* UNIV_LOG_ARCHIVE */ + +/** Set if InnoDB must operate in read-only mode. We don't do any +recovery and open all tables in RO mode instead of RW mode. We don't +sync the max trx id to disk either. */ +extern my_bool srv_read_only_mode; +/** store to its own file each table created by an user; data +dictionary tables are in the system tablespace 0 */ +extern my_bool srv_file_per_table; +/** Sleep delay for threads waiting to enter InnoDB. In micro-seconds. */ +extern ulong srv_thread_sleep_delay; +#if defined(HAVE_ATOMIC_BUILTINS) +/** Maximum sleep delay (in micro-seconds), value of 0 disables it.*/ +extern ulong srv_adaptive_max_sleep_delay; +#endif /* HAVE_ATOMIC_BUILTINS */ + +/** The file format to use on new *.ibd files. */ +extern ulint srv_file_format; +/** Whether to check file format during startup. A value of +UNIV_FORMAT_MAX + 1 means no checking ie. FALSE. The default is to +set it to the highest format we support. */ +extern ulint srv_max_file_format_at_startup; +/** Place locks to records only i.e. do not use next-key locking except +on duplicate key checking and foreign key checking */ +extern ibool srv_locks_unsafe_for_binlog; + +/** Sort buffer size in index creation */ +extern ulong srv_sort_buf_size; +/** Maximum modification log file size for online index creation */ +extern unsigned long long srv_online_max_size; + +/* If this flag is TRUE, then we will use the native aio of the +OS (provided we compiled Innobase with it in), otherwise we will +use simulated aio we build below with threads. +Currently we support native aio on windows and linux */ +extern my_bool srv_use_native_aio; +#ifdef __WIN__ +extern ibool srv_use_native_conditions; +#endif /* __WIN__ */ +#endif /* !UNIV_HOTBACKUP */ + +/** Server undo tablespaces directory, can be absolute path. */ +extern char* srv_undo_dir; + +/** Number of undo tablespaces to use. */ +extern ulong srv_undo_tablespaces; + +/** The number of UNDO tablespaces that are open and ready to use. */ +extern ulint srv_undo_tablespaces_open; + +/* The number of undo segments to use */ +extern ulong srv_undo_logs; + +extern ulint srv_n_data_files; +extern char** srv_data_file_names; +extern ulint* srv_data_file_sizes; +extern ulint* srv_data_file_is_raw_partition; + +extern ibool srv_auto_extend_last_data_file; +extern ulint srv_last_file_size_max; +extern char* srv_log_group_home_dir; +#ifndef UNIV_HOTBACKUP +extern ulong srv_auto_extend_increment; + +extern ibool srv_created_new_raw; + +/** Maximum number of srv_n_log_files, or innodb_log_files_in_group */ +#define SRV_N_LOG_FILES_MAX 100 +extern ulong srv_n_log_files; +extern ib_uint64_t srv_log_file_size; +extern ib_uint64_t srv_log_file_size_requested; +extern ulint srv_log_buffer_size; +extern ulong srv_flush_log_at_trx_commit; +extern uint srv_flush_log_at_timeout; +extern char srv_adaptive_flushing; + +/* If this flag is TRUE, then we will load the indexes' (and tables') metadata +even if they are marked as "corrupted". Mostly it is for DBA to process +corrupted index and table */ +extern my_bool srv_load_corrupted; + +/* The sort order table of the MySQL latin1_swedish_ci character set +collation */ +extern const byte* srv_latin1_ordering; +#ifndef UNIV_HOTBACKUP +extern my_bool srv_use_sys_malloc; +#else +extern ibool srv_use_sys_malloc; +#endif /* UNIV_HOTBACKUP */ +extern ulint srv_buf_pool_size; /*!< requested size in bytes */ +extern ulint srv_buf_pool_instances; /*!< requested number of buffer pool instances */ +extern ulong srv_n_page_hash_locks; /*!< number of locks to + protect buf_pool->page_hash */ +extern ulong srv_LRU_scan_depth; /*!< Scan depth for LRU + flush batch */ +extern ulong srv_flush_neighbors; /*!< whether or not to flush + neighbors of a block */ +extern ulint srv_buf_pool_old_size; /*!< previously requested size */ +extern ulint srv_buf_pool_curr_size; /*!< current size in bytes */ +extern ulint srv_mem_pool_size; +extern ulint srv_lock_table_size; + +extern ulint srv_n_file_io_threads; +extern my_bool srv_random_read_ahead; +extern ulong srv_read_ahead_threshold; +extern ulint srv_n_read_io_threads; +extern ulint srv_n_write_io_threads; + +/* Number of IO operations per second the server can do */ +extern ulong srv_io_capacity; + +/* We use this dummy default value at startup for max_io_capacity. +The real value is set based on the value of io_capacity. */ +#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (~0UL) +#define SRV_MAX_IO_CAPACITY_LIMIT (~0UL) +extern ulong srv_max_io_capacity; +/* Returns the number of IO operations that is X percent of the +capacity. PCT_IO(5) -> returns the number of IO operations that +is 5% of the max where max is srv_io_capacity. */ +#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) (p) / 100.0))) + +/* The "innodb_stats_method" setting, decides how InnoDB is going +to treat NULL value when collecting statistics. It is not defined +as enum type because the configure option takes unsigned integer type. */ +extern ulong srv_innodb_stats_method; + +#ifdef UNIV_LOG_ARCHIVE +extern ibool srv_log_archive_on; +extern ibool srv_archive_recovery; +extern ib_uint64_t srv_archive_recovery_limit_lsn; +#endif /* UNIV_LOG_ARCHIVE */ + +extern char* srv_file_flush_method_str; +extern ulint srv_unix_file_flush_method; +extern ulint srv_win_file_flush_method; + +extern ulint srv_max_n_open_files; + +extern ulong srv_max_dirty_pages_pct; +extern ulong srv_max_dirty_pages_pct_lwm; + +extern ulong srv_adaptive_flushing_lwm; +extern ulong srv_flushing_avg_loops; + +extern ulong srv_force_recovery; +#ifndef DBUG_OFF +extern ulong srv_force_recovery_crash; +#endif /* !DBUG_OFF */ + +extern ulint srv_fast_shutdown; /*!< If this is 1, do not do a + purge and index buffer merge. + If this 2, do not even flush the + buffer pool to data files at the + shutdown: we effectively 'crash' + InnoDB (but lose no committed + transactions). */ +extern ibool srv_innodb_status; + +extern unsigned long long srv_stats_transient_sample_pages; +extern my_bool srv_stats_persistent; +extern unsigned long long srv_stats_persistent_sample_pages; +extern my_bool srv_stats_auto_recalc; + +extern ibool srv_use_doublewrite_buf; +extern ulong srv_doublewrite_batch_size; +extern ulong srv_checksum_algorithm; + +extern ulong srv_max_buf_pool_modified_pct; +extern ulong srv_max_purge_lag; +extern ulong srv_max_purge_lag_delay; + +extern ulong srv_replication_delay; +/*-------------------------------------------*/ + +extern my_bool srv_print_innodb_monitor; +extern my_bool srv_print_innodb_lock_monitor; +extern ibool srv_print_innodb_tablespace_monitor; +extern ibool srv_print_verbose_log; +#define DEPRECATED_MSG_INNODB_TABLE_MONITOR \ + "Using innodb_table_monitor is deprecated and it may be removed " \ + "in future releases. Please use the InnoDB INFORMATION_SCHEMA " \ + "tables instead, see " REFMAN "innodb-i_s-tables.html" +extern ibool srv_print_innodb_table_monitor; + +extern ibool srv_monitor_active; +extern ibool srv_error_monitor_active; + +/* TRUE during the lifetime of the buffer pool dump/load thread */ +extern ibool srv_buf_dump_thread_active; + +/* TRUE during the lifetime of the stats thread */ +extern ibool srv_dict_stats_thread_active; + +extern ulong srv_n_spin_wait_rounds; +extern ulong srv_n_free_tickets_to_enter; +extern ulong srv_thread_sleep_delay; +extern ulong srv_spin_wait_delay; +extern ibool srv_priority_boost; + +extern ulint srv_truncated_status_writes; +extern ulint srv_available_undo_logs; + +extern ulint srv_mem_pool_size; +extern ulint srv_lock_table_size; + +#ifdef UNIV_DEBUG +extern ibool srv_print_thread_releases; +extern ibool srv_print_lock_waits; +extern ibool srv_print_buf_io; +extern ibool srv_print_log_io; +extern ibool srv_print_latch_waits; +#else /* UNIV_DEBUG */ +# define srv_print_thread_releases FALSE +# define srv_print_lock_waits FALSE +# define srv_print_buf_io FALSE +# define srv_print_log_io FALSE +# define srv_print_latch_waits FALSE +#endif /* UNIV_DEBUG */ + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +extern my_bool srv_ibuf_disable_background_merge; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +#ifdef UNIV_DEBUG +extern my_bool srv_purge_view_update_only_debug; +#endif /* UNIV_DEBUG */ + +extern ulint srv_fatal_semaphore_wait_threshold; +#define SRV_SEMAPHORE_WAIT_EXTENSION 7200 +extern ulint srv_dml_needed_delay; + +#ifndef HAVE_ATOMIC_BUILTINS +/** Mutex protecting some server global variables. */ +extern ib_mutex_t server_mutex; +#endif /* !HAVE_ATOMIC_BUILTINS */ + +#define SRV_MAX_N_IO_THREADS 130 + +/* Array of English strings describing the current state of an +i/o handler thread */ +extern const char* srv_io_thread_op_info[]; +extern const char* srv_io_thread_function[]; + +/* the number of purge threads to use from the worker pool (currently 0 or 1) */ +extern ulong srv_n_purge_threads; + +/* the number of pages to purge in one batch */ +extern ulong srv_purge_batch_size; + +/* the number of sync wait arrays */ +extern ulong srv_sync_array_size; + +/* print all user-level transactions deadlocks to mysqld stderr */ +extern my_bool srv_print_all_deadlocks; + +extern my_bool srv_cmp_per_index_enabled; + +/** Status variables to be passed to MySQL */ +extern struct export_var_t export_vars; + +/** Global counters */ +extern srv_stats_t srv_stats; + +# ifdef UNIV_PFS_THREAD +/* Keys to register InnoDB threads with performance schema */ +extern mysql_pfs_key_t buf_page_cleaner_thread_key; +extern mysql_pfs_key_t trx_rollback_clean_thread_key; +extern mysql_pfs_key_t io_handler_thread_key; +extern mysql_pfs_key_t srv_lock_timeout_thread_key; +extern mysql_pfs_key_t srv_error_monitor_thread_key; +extern mysql_pfs_key_t srv_monitor_thread_key; +extern mysql_pfs_key_t srv_master_thread_key; +extern mysql_pfs_key_t srv_purge_thread_key; +extern mysql_pfs_key_t recv_writer_thread_key; + +/* This macro register the current thread and its key with performance +schema */ +# define pfs_register_thread(key) \ +do { \ + struct PSI_thread* psi = PSI_THREAD_CALL(new_thread)(key, NULL, 0);\ + PSI_THREAD_CALL(set_thread)(psi); \ +} while (0) + +/* This macro delist the current thread from performance schema */ +# define pfs_delete_thread() \ +do { \ + PSI_THREAD_CALL(delete_current_thread)(); \ +} while (0) +# endif /* UNIV_PFS_THREAD */ + +#endif /* !UNIV_HOTBACKUP */ + +/** Types of raw partitions in innodb_data_file_path */ +enum { + SRV_NOT_RAW = 0, /*!< Not a raw partition */ + SRV_NEW_RAW, /*!< A 'newraw' partition, only to be + initialized */ + SRV_OLD_RAW /*!< An initialized raw partition */ +}; + +/** Alternatives for the file flush option in Unix; see the InnoDB manual +about what these mean */ +enum { + SRV_UNIX_FSYNC = 1, /*!< fsync, the default */ + SRV_UNIX_O_DSYNC, /*!< open log files in O_SYNC mode */ + SRV_UNIX_LITTLESYNC, /*!< do not call os_file_flush() + when writing data files, but do flush + after writing to log files */ + SRV_UNIX_NOSYNC, /*!< do not flush after writing */ + SRV_UNIX_O_DIRECT, /*!< invoke os_file_set_nocache() on + data files. This implies using + non-buffered IO but still using fsync, + the reason for which is that some FS + do not flush meta-data when + unbuffered IO happens */ + SRV_UNIX_O_DIRECT_NO_FSYNC + /*!< do not use fsync() when using + direct IO i.e.: it can be set to avoid + the fsync() call that we make when + using SRV_UNIX_O_DIRECT. However, in + this case user/DBA should be sure about + the integrity of the meta-data */ +}; + +/** Alternatives for file i/o in Windows */ +enum { + SRV_WIN_IO_NORMAL = 1, /*!< buffered I/O */ + SRV_WIN_IO_UNBUFFERED /*!< unbuffered I/O; this is the default */ +}; + +/** Alternatives for srv_force_recovery. Non-zero values are intended +to help the user get a damaged database up so that he can dump intact +tables and rows with SELECT INTO OUTFILE. The database must not otherwise +be used with these options! A bigger number below means that all precautions +of lower numbers are included. */ +enum { + SRV_FORCE_IGNORE_CORRUPT = 1, /*!< let the server run even if it + detects a corrupt page */ + SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from + running: if a crash would occur + in purge, this prevents it */ + SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run trx rollback after + recovery */ + SRV_FORCE_NO_IBUF_MERGE = 4, /*!< prevent also ibuf operations: + if they would cause a crash, better + not do them */ + SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when + starting the database: InnoDB will + treat even incomplete transactions + as committed */ + SRV_FORCE_NO_LOG_REDO = 6 /*!< do not do the log roll-forward + in connection with recovery */ +}; + +/* Alternatives for srv_innodb_stats_method, which could be changed by +setting innodb_stats_method */ +enum srv_stats_method_name_enum { + SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as + equal. This is the default setting + for innodb_stats_method */ + SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as + NOT equal. */ + SRV_STATS_NULLS_IGNORED /* NULL values are ignored */ +}; + +typedef enum srv_stats_method_name_enum srv_stats_method_name_t; + +#ifndef UNIV_HOTBACKUP +/** Types of threads existing in the system. */ +enum srv_thread_type { + SRV_NONE, /*!< None */ + SRV_WORKER, /*!< threads serving parallelized + queries and queries released from + lock wait */ + SRV_PURGE, /*!< Purge coordinator thread */ + SRV_MASTER /*!< the master thread, (whose type + number must be biggest) */ +}; + +/*********************************************************************//** +Boots Innobase server. */ +UNIV_INTERN +void +srv_boot(void); +/*==========*/ +/*********************************************************************//** +Initializes the server. */ +UNIV_INTERN +void +srv_init(void); +/*==========*/ +/*********************************************************************//** +Frees the data structures created in srv_init(). */ +UNIV_INTERN +void +srv_free(void); +/*==========*/ +/*********************************************************************//** +Initializes the synchronization primitives, memory system, and the thread +local storage. */ +UNIV_INTERN +void +srv_general_init(void); +/*==================*/ +/*********************************************************************//** +Sets the info describing an i/o thread current state. */ +UNIV_INTERN +void +srv_set_io_thread_op_info( +/*======================*/ + ulint i, /*!< in: the 'segment' of the i/o thread */ + const char* str); /*!< in: constant char string describing the + state */ +/*********************************************************************//** +Resets the info describing an i/o thread current state. */ +UNIV_INTERN +void +srv_reset_io_thread_op_info(); +/*=========================*/ +/*******************************************************************//** +Tells the purge thread that there has been activity in the database +and wakes up the purge thread if it is suspended (not sleeping). Note +that there is a small chance that the purge thread stays suspended +(we do not protect our operation with the srv_sys_t:mutex, for +performance reasons). */ +UNIV_INTERN +void +srv_wake_purge_thread_if_not_active(void); +/*=====================================*/ +/*******************************************************************//** +Tells the Innobase server that there has been activity in the database +and wakes up the master thread if it is suspended (not sleeping). Used +in the MySQL interface. Note that there is a small chance that the master +thread stays suspended (we do not protect our operation with the kernel +mutex, for performace reasons). */ +UNIV_INTERN +void +srv_active_wake_master_thread(void); +/*===============================*/ +/*******************************************************************//** +Wakes up the master thread if it is suspended or being suspended. */ +UNIV_INTERN +void +srv_wake_master_thread(void); +/*========================*/ +/******************************************************************//** +Outputs to a file the output of the InnoDB Monitor. +@return FALSE if not all information printed +due to failure to obtain necessary mutex */ +UNIV_INTERN +ibool +srv_printf_innodb_monitor( +/*======================*/ + FILE* file, /*!< in: output stream */ + ibool nowait, /*!< in: whether to wait for the + lock_sys_t::mutex */ + ulint* trx_start, /*!< out: file position of the start of + the list of active transactions */ + ulint* trx_end); /*!< out: file position of the end of + the list of active transactions */ + +/******************************************************************//** +Function to pass InnoDB status variables to MySQL */ +UNIV_INTERN +void +srv_export_innodb_status(void); +/*==========================*/ +/*******************************************************************//** +Get current server activity count. We don't hold srv_sys::mutex while +reading this value as it is only used in heuristics. +@return activity count. */ +UNIV_INTERN +ulint +srv_get_activity_count(void); +/*========================*/ +/*******************************************************************//** +Check if there has been any activity. +@return FALSE if no change in activity counter. */ +UNIV_INTERN +ibool +srv_check_activity( +/*===============*/ + ulint old_activity_count); /*!< old activity count */ +/******************************************************************//** +Increment the server activity counter. */ +UNIV_INTERN +void +srv_inc_activity_count(void); +/*=========================*/ + +/**********************************************************************//** +Enqueues a task to server task queue and releases a worker thread, if there +is a suspended one. */ +UNIV_INTERN +void +srv_que_task_enqueue_low( +/*=====================*/ + que_thr_t* thr); /*!< in: query thread */ + +/**********************************************************************//** +Check whether any background thread is active. If so, return the thread +type. +@return SRV_NONE if all are are suspended or have exited, thread +type if any are still active. */ +UNIV_INTERN +enum srv_thread_type +srv_get_active_thread_type(void); +/*============================*/ + +extern "C" { + +/*********************************************************************//** +A thread which prints the info output by various InnoDB monitors. +@return a dummy parameter */ +UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(srv_monitor_thread)( +/*===============================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ + +/*********************************************************************//** +The master thread controlling the server. +@return a dummy parameter */ +UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(srv_master_thread)( +/*==============================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ + +/************************************************************************* +A thread which prints warnings about semaphore waits which have lasted +too long. These can be used to track bugs which cause hangs. +@return a dummy parameter */ +UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(srv_error_monitor_thread)( +/*=====================================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ + +/*********************************************************************//** +Purge coordinator thread that schedules the purge tasks. +@return a dummy parameter */ +UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(srv_purge_coordinator_thread)( +/*=========================================*/ + void* arg __attribute__((unused))); /*!< in: a dummy parameter + required by os_thread_create */ + +/*********************************************************************//** +Worker thread that reads tasks from the work queue and executes them. +@return a dummy parameter */ +UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(srv_worker_thread)( +/*==============================*/ + void* arg __attribute__((unused))); /*!< in: a dummy parameter + required by os_thread_create */ +} /* extern "C" */ + +/**********************************************************************//** +Get count of tasks in the queue. +@return number of tasks in queue */ +UNIV_INTERN +ulint +srv_get_task_queue_length(void); +/*===========================*/ + +/*********************************************************************//** +Releases threads of the type given from suspension in the thread table. +NOTE! The server mutex has to be reserved by the caller! +@return number of threads released: this may be less than n if not +enough threads were suspended at the moment */ +UNIV_INTERN +ulint +srv_release_threads( +/*================*/ + enum srv_thread_type type, /*!< in: thread type */ + ulint n); /*!< in: number of threads to release */ + +/**********************************************************************//** +Check whether any background thread are active. If so print which thread +is active. Send the threads wakeup signal. +@return name of thread that is active or NULL */ +UNIV_INTERN +const char* +srv_any_background_threads_are_active(void); +/*=======================================*/ + +/**********************************************************************//** +Wakeup the purge threads. */ +UNIV_INTERN +void +srv_purge_wakeup(void); +/*==================*/ + +/** Status variables to be passed to MySQL */ +struct export_var_t{ + ulint innodb_data_pending_reads; /*!< Pending reads */ + ulint innodb_data_pending_writes; /*!< Pending writes */ + ulint innodb_data_pending_fsyncs; /*!< Pending fsyncs */ + ulint innodb_data_fsyncs; /*!< Number of fsyncs so far */ + ulint innodb_data_read; /*!< Data bytes read */ + ulint innodb_data_writes; /*!< I/O write requests */ + ulint innodb_data_written; /*!< Data bytes written */ + ulint innodb_data_reads; /*!< I/O read requests */ + char innodb_buffer_pool_dump_status[512];/*!< Buf pool dump status */ + char innodb_buffer_pool_load_status[512];/*!< Buf pool load status */ + ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */ + ulint innodb_buffer_pool_pages_data; /*!< Data pages */ + ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */ + ulint innodb_buffer_pool_pages_dirty; /*!< Dirty data pages */ + ulint innodb_buffer_pool_bytes_dirty; /*!< File bytes modified */ + ulint innodb_buffer_pool_pages_misc; /*!< Miscellanous pages */ + ulint innodb_buffer_pool_pages_free; /*!< Free pages */ +#ifdef UNIV_DEBUG + ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */ +#endif /* UNIV_DEBUG */ + ulint innodb_buffer_pool_read_requests; /*!< buf_pool->stat.n_page_gets */ + ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */ + ulint innodb_buffer_pool_wait_free; /*!< srv_buf_pool_wait_free */ + ulint innodb_buffer_pool_pages_flushed; /*!< srv_buf_pool_flushed */ + ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */ + ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */ + ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */ + ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/ + ulint innodb_dblwr_pages_written; /*!< srv_dblwr_pages_written */ + ulint innodb_dblwr_writes; /*!< srv_dblwr_writes */ + ibool innodb_have_atomic_builtins; /*!< HAVE_ATOMIC_BUILTINS */ + ulint innodb_log_waits; /*!< srv_log_waits */ + ulint innodb_log_write_requests; /*!< srv_log_write_requests */ + ulint innodb_log_writes; /*!< srv_log_writes */ + lsn_t innodb_os_log_written; /*!< srv_os_log_written */ + ulint innodb_os_log_fsyncs; /*!< fil_n_log_flushes */ + ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */ + ulint innodb_os_log_pending_fsyncs; /*!< fil_n_pending_log_flushes */ + ulint innodb_page_size; /*!< UNIV_PAGE_SIZE */ + ulint innodb_pages_created; /*!< buf_pool->stat.n_pages_created */ + ulint innodb_pages_read; /*!< buf_pool->stat.n_pages_read */ + ulint innodb_pages_written; /*!< buf_pool->stat.n_pages_written */ + ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */ + ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */ + ib_int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time + / 1000 */ + ulint innodb_row_lock_time_avg; /*!< srv_n_lock_wait_time + / 1000 + / srv_n_lock_wait_count */ + ulint innodb_row_lock_time_max; /*!< srv_n_lock_max_wait_time + / 1000 */ + ulint innodb_rows_read; /*!< srv_n_rows_read */ + ulint innodb_rows_inserted; /*!< srv_n_rows_inserted */ + ulint innodb_rows_updated; /*!< srv_n_rows_updated */ + ulint innodb_rows_deleted; /*!< srv_n_rows_deleted */ + ulint innodb_num_open_files; /*!< fil_n_file_opened */ + ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */ + ulint innodb_available_undo_logs; /*!< srv_available_undo_logs */ +#ifdef UNIV_DEBUG + ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */ + ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id + - purged view's min trx_id */ +#endif /* UNIV_DEBUG */ +}; + +/** Thread slot in the thread table. */ +struct srv_slot_t{ + srv_thread_type type; /*!< thread type: user, + utility etc. */ + ibool in_use; /*!< TRUE if this slot + is in use */ + ibool suspended; /*!< TRUE if the thread is + waiting for the event of this + slot */ + ib_time_t suspend_time; /*!< time when the thread was + suspended. Initialized by + lock_wait_table_reserve_slot() + for lock wait */ + ulong wait_timeout; /*!< wait time that if exceeded + the thread will be timed out. + Initialized by + lock_wait_table_reserve_slot() + for lock wait */ + os_event_t event; /*!< event used in suspending + the thread when it has nothing + to do */ + que_thr_t* thr; /*!< suspended query thread + (only used for user threads) */ +}; + +#else /* !UNIV_HOTBACKUP */ +# define srv_use_adaptive_hash_indexes FALSE +# define srv_use_native_aio FALSE +# define srv_force_recovery 0UL +# define srv_set_io_thread_op_info(t,info) ((void) 0) +# define srv_reset_io_thread_op_info() ((void) 0) +# define srv_is_being_started 0 +# define srv_win_file_flush_method SRV_WIN_IO_UNBUFFERED +# define srv_unix_file_flush_method SRV_UNIX_O_DSYNC +# define srv_start_raw_disk_in_use 0 +# define srv_file_per_table 1 +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/innobase/include/srv0srv.ic b/storage/innobase/include/srv0srv.ic new file mode 100644 index 00000000000..53405c06f97 --- /dev/null +++ b/storage/innobase/include/srv0srv.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/srv0srv.ic +Server main program + +Created 10/4/1995 Heikki Tuuri +*******************************************************/ diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h new file mode 100644 index 00000000000..40d502f4459 --- /dev/null +++ b/storage/innobase/include/srv0start.h @@ -0,0 +1,167 @@ +/***************************************************************************** + +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/srv0start.h +Starts the Innobase database server + +Created 10/10/1995 Heikki Tuuri +*******************************************************/ + +#ifndef srv0start_h +#define srv0start_h + +#include "univ.i" +#include "log0log.h" +#include "ut0byte.h" + +#ifdef __WIN__ +#define SRV_PATH_SEPARATOR '\\' +#else +#define SRV_PATH_SEPARATOR '/' +#endif + +/*********************************************************************//** +Normalizes a directory path for Windows: converts slashes to backslashes. */ +UNIV_INTERN +void +srv_normalize_path_for_win( +/*=======================*/ + char* str); /*!< in/out: null-terminated character string */ +/*********************************************************************//** +Reads the data files and their sizes from a character string given in +the .cnf file. +@return TRUE if ok, FALSE on parse error */ +UNIV_INTERN +ibool +srv_parse_data_file_paths_and_sizes( +/*================================*/ + char* str); /*!< in/out: the data file path string */ +/*********************************************************************//** +Frees the memory allocated by srv_parse_data_file_paths_and_sizes() +and srv_parse_log_group_home_dirs(). */ +UNIV_INTERN +void +srv_free_paths_and_sizes(void); +/*==========================*/ +/*********************************************************************//** +Adds a slash or a backslash to the end of a string if it is missing +and the string is not empty. +@return string which has the separator if the string is not empty */ +UNIV_INTERN +char* +srv_add_path_separator_if_needed( +/*=============================*/ + char* str); /*!< in: null-terminated character string */ +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Starts Innobase and creates a new database if database files +are not found and the user wants. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +innobase_start_or_create_for_mysql(void); +/*====================================*/ +/****************************************************************//** +Shuts down the Innobase database. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +innobase_shutdown_for_mysql(void); + +/******************************************************************** +Signal all per-table background threads to shutdown, and wait for them to do +so. */ +UNIV_INTERN +void +srv_shutdown_table_bg_threads(void); +/*=============================*/ + +/*************************************************************//** +Copy the file path component of the physical file to parameter. It will +copy up to and including the terminating path separator. +@return number of bytes copied or ULINT_UNDEFINED if destination buffer + is smaller than the path to be copied. */ +UNIV_INTERN +ulint +srv_path_copy( +/*==========*/ + char* dest, /*!< out: destination buffer */ + ulint dest_len, /*!< in: max bytes to copy */ + const char* basedir, /*!< in: base directory */ + const char* table_name) /*!< in: source table name */ + __attribute__((nonnull, warn_unused_result)); + +/*****************************************************************//** +Get the meta-data filename from the table name. */ +UNIV_INTERN +void +srv_get_meta_data_filename( +/*======================*/ + dict_table_t* table, /*!< in: table */ + char* filename, /*!< out: filename */ + ulint max_len) /*!< in: filename max length */ + __attribute__((nonnull)); + +/** Log sequence number at shutdown */ +extern lsn_t srv_shutdown_lsn; +/** Log sequence number immediately after startup */ +extern lsn_t srv_start_lsn; + +#ifdef HAVE_DARWIN_THREADS +/** TRUE if the F_FULLFSYNC option is available */ +extern ibool srv_have_fullfsync; +#endif + +/** TRUE if the server is being started */ +extern ibool srv_is_being_started; +/** TRUE if the server was successfully started */ +extern ibool srv_was_started; +/** TRUE if the server is being started, before rolling back any +incomplete transactions */ +extern ibool srv_startup_is_before_trx_rollback_phase; + +/** TRUE if a raw partition is in use */ +extern ibool srv_start_raw_disk_in_use; + + +/** Shutdown state */ +enum srv_shutdown_state { + SRV_SHUTDOWN_NONE = 0, /*!< Database running normally */ + SRV_SHUTDOWN_CLEANUP, /*!< Cleaning up in + logs_empty_and_mark_files_at_shutdown() */ + SRV_SHUTDOWN_FLUSH_PHASE,/*!< At this phase the master and the + purge threads must have completed their + work. Once we enter this phase the + page_cleaner can clean up the buffer + pool and exit */ + SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that + the buffer pool can be freed: flush + all file spaces and close all files */ + SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */ +}; + +/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to +SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */ +extern enum srv_shutdown_state srv_shutdown_state; +#endif /* !UNIV_HOTBACKUP */ + +/** Log 'spaces' have id's >= this */ +#define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0UL + +#endif diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h new file mode 100644 index 00000000000..15dbdcb540d --- /dev/null +++ b/storage/innobase/include/sync0arr.h @@ -0,0 +1,155 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0arr.h +The wait array used in synchronization primitives + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0arr_h +#define sync0arr_h + +#include "univ.i" +#include "ut0lst.h" +#include "ut0mem.h" +#include "os0thread.h" + +/** Synchronization wait array cell */ +struct sync_cell_t; +/** Synchronization wait array */ +struct sync_array_t; + +/******************************************************************//** +Get an instance of the sync wait array and reserve a wait array cell +in the instance for waiting for an object. The event of the cell is +reset to nonsignalled state. +If reserving cell of the instance fails, try to get another new +instance until we can reserve an empty cell of it. +@return the instance found, never NULL. */ +UNIV_INLINE +sync_array_t* +sync_array_get_and_reserve_cell( +/*============================*/ + void* object, /*!< in: pointer to the object to wait for */ + ulint type, /*!< in: lock request type */ + const char* file, /*!< in: file where requested */ + ulint line, /*!< in: line where requested */ + ulint* index); /*!< out: index of the reserved cell */ +/******************************************************************//** +Reserves a wait array cell for waiting for an object. +The event of the cell is reset to nonsignalled state. +@return true if free cell is found, otherwise false */ +UNIV_INTERN +bool +sync_array_reserve_cell( +/*====================*/ + sync_array_t* arr, /*!< in: wait array */ + void* object, /*!< in: pointer to the object to wait for */ + ulint type, /*!< in: lock request type */ + const char* file, /*!< in: file where requested */ + ulint line, /*!< in: line where requested */ + ulint* index); /*!< out: index of the reserved cell */ +/******************************************************************//** +This function should be called when a thread starts to wait on +a wait array cell. In the debug version this function checks +if the wait for a semaphore will result in a deadlock, in which +case prints info and asserts. */ +UNIV_INTERN +void +sync_array_wait_event( +/*==================*/ + sync_array_t* arr, /*!< in: wait array */ + ulint index); /*!< in: index of the reserved cell */ +/******************************************************************//** +Frees the cell. NOTE! sync_array_wait_event frees the cell +automatically! */ +UNIV_INTERN +void +sync_array_free_cell( +/*=================*/ + sync_array_t* arr, /*!< in: wait array */ + ulint index); /*!< in: index of the cell in array */ +/**********************************************************************//** +Note that one of the wait objects was signalled. */ +UNIV_INTERN +void +sync_array_object_signalled(void); +/*=============================*/ + +/**********************************************************************//** +If the wakeup algorithm does not work perfectly at semaphore relases, +this function will do the waking (see the comment in mutex_exit). This +function should be called about every 1 second in the server. */ +UNIV_INTERN +void +sync_arr_wake_threads_if_sema_free(void); +/*====================================*/ +/**********************************************************************//** +Prints warnings of long semaphore waits to stderr. +@return TRUE if fatal semaphore wait threshold was exceeded */ +UNIV_INTERN +ibool +sync_array_print_long_waits( +/*========================*/ + os_thread_id_t* waiter, /*!< out: longest waiting thread */ + const void** sema) /*!< out: longest-waited-for semaphore */ + __attribute__((nonnull)); +/********************************************************************//** +Validates the integrity of the wait array. Checks +that the number of reserved cells equals the count variable. */ +UNIV_INTERN +void +sync_array_validate( +/*================*/ + sync_array_t* arr); /*!< in: sync wait array */ +/**********************************************************************//** +Prints info of the wait array. */ +UNIV_INTERN +void +sync_array_print( +/*=============*/ + FILE* file); /*!< in: file where to print */ + +/**********************************************************************//** +Create the primary system wait array(s), they are protected by an OS mutex */ +UNIV_INTERN +void +sync_array_init( +/*============*/ + ulint n_threads); /*!< in: Number of slots to create */ +/**********************************************************************//** +Close sync array wait sub-system. */ +UNIV_INTERN +void +sync_array_close(void); +/*==================*/ + +/**********************************************************************//** +Get an instance of the sync wait array. */ +UNIV_INTERN +sync_array_t* +sync_array_get(void); +/*================*/ + +#ifndef UNIV_NONINL +#include "sync0arr.ic" +#endif + +#endif diff --git a/storage/innobase/include/sync0arr.ic b/storage/innobase/include/sync0arr.ic new file mode 100644 index 00000000000..18a46dd0a41 --- /dev/null +++ b/storage/innobase/include/sync0arr.ic @@ -0,0 +1,64 @@ +/***************************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0arr.ic +The wait array for synchronization primitives + +Inline code + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +/** User configured sync array size */ +extern ulong srv_sync_array_size; + +/******************************************************************//** +Get an instance of the sync wait array and reserve a wait array cell +in the instance for waiting for an object. The event of the cell is +reset to nonsignalled state. +If reserving cell of the instance fails, try to get another new +instance until we can reserve an empty cell of it. +@return the instance found, never NULL. */ +UNIV_INLINE +sync_array_t* +sync_array_get_and_reserve_cell( +/*============================*/ + void* object, /*!< in: pointer to the object to wait for */ + ulint type, /*!< in: lock request type */ + const char* file, /*!< in: file where requested */ + ulint line, /*!< in: line where requested */ + ulint* index) /*!< out: index of the reserved cell */ +{ + sync_array_t* sync_arr; + bool reserved = false; + + for (ulint i = 0; i < srv_sync_array_size && !reserved; ++i) { + sync_arr = sync_array_get(); + reserved = sync_array_reserve_cell(sync_arr, object, type, + file, line, index); + } + + /* This won't be true every time, for the loop above may execute + more than srv_sync_array_size times to reserve a cell. + But an assertion here makes the code more solid. */ + ut_a(reserved); + + return sync_arr; +} + diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h new file mode 100644 index 00000000000..fdcbb1b6fa5 --- /dev/null +++ b/storage/innobase/include/sync0rw.h @@ -0,0 +1,813 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0rw.h +The read-write lock (for threads, not for database transactions) + +Created 9/11/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0rw_h +#define sync0rw_h + +#include "univ.i" +#ifndef UNIV_HOTBACKUP +#include "ut0lst.h" +#include "ut0counter.h" +#include "sync0sync.h" +#include "os0sync.h" + +/* The following undef is to prevent a name conflict with a macro +in MySQL: */ +#undef rw_lock_t +#endif /* !UNIV_HOTBACKUP */ + +/** Counters for RW locks. */ +struct rw_lock_stats_t { + typedef ib_counter_t<ib_int64_t, IB_N_SLOTS> ib_int64_counter_t; + + /** number of spin waits on rw-latches, + resulted during shared (read) locks */ + ib_int64_counter_t rw_s_spin_wait_count; + + /** number of spin loop rounds on rw-latches, + resulted during shared (read) locks */ + ib_int64_counter_t rw_s_spin_round_count; + + /** number of OS waits on rw-latches, + resulted during shared (read) locks */ + ib_int64_counter_t rw_s_os_wait_count; + + /** number of unlocks (that unlock shared locks), + set only when UNIV_SYNC_PERF_STAT is defined */ + ib_int64_counter_t rw_s_exit_count; + + /** number of spin waits on rw-latches, + resulted during exclusive (write) locks */ + ib_int64_counter_t rw_x_spin_wait_count; + + /** number of spin loop rounds on rw-latches, + resulted during exclusive (write) locks */ + ib_int64_counter_t rw_x_spin_round_count; + + /** number of OS waits on rw-latches, + resulted during exclusive (write) locks */ + ib_int64_counter_t rw_x_os_wait_count; + + /** number of unlocks (that unlock exclusive locks), + set only when UNIV_SYNC_PERF_STAT is defined */ + ib_int64_counter_t rw_x_exit_count; +}; + +/* Latch types; these are used also in btr0btr.h: keep the numerical values +smaller than 30 and the order of the numerical values like below! */ +#define RW_S_LATCH 1 +#define RW_X_LATCH 2 +#define RW_NO_LATCH 3 + +#ifndef UNIV_HOTBACKUP +/* We decrement lock_word by this amount for each x_lock. It is also the +start value for the lock_word, meaning that it limits the maximum number +of concurrent read locks before the rw_lock breaks. The current value of +0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/ +#define X_LOCK_DECR 0x00100000 + +struct rw_lock_t; +#ifdef UNIV_SYNC_DEBUG +struct rw_lock_debug_t; +#endif /* UNIV_SYNC_DEBUG */ + +typedef UT_LIST_BASE_NODE_T(rw_lock_t) rw_lock_list_t; + +extern rw_lock_list_t rw_lock_list; +extern ib_mutex_t rw_lock_list_mutex; + +#ifdef UNIV_SYNC_DEBUG +/* The global mutex which protects debug info lists of all rw-locks. +To modify the debug info list of an rw-lock, this mutex has to be + +acquired in addition to the mutex protecting the lock. */ +extern ib_mutex_t rw_lock_debug_mutex; +extern os_event_t rw_lock_debug_event; /*!< If deadlock detection does + not get immediately the mutex it + may wait for this event */ +extern ibool rw_lock_debug_waiters; /*!< This is set to TRUE, if + there may be waiters for the event */ +#endif /* UNIV_SYNC_DEBUG */ + +/** Counters for RW locks. */ +extern rw_lock_stats_t rw_lock_stats; + +#ifdef UNIV_PFS_RWLOCK +/* Following are rwlock keys used to register with MySQL +performance schema */ +# ifdef UNIV_LOG_ARCHIVE +extern mysql_pfs_key_t archive_lock_key; +# endif /* UNIV_LOG_ARCHIVE */ +extern mysql_pfs_key_t btr_search_latch_key; +extern mysql_pfs_key_t buf_block_lock_key; +# ifdef UNIV_SYNC_DEBUG +extern mysql_pfs_key_t buf_block_debug_latch_key; +# endif /* UNIV_SYNC_DEBUG */ +extern mysql_pfs_key_t dict_operation_lock_key; +extern mysql_pfs_key_t checkpoint_lock_key; +extern mysql_pfs_key_t fil_space_latch_key; +extern mysql_pfs_key_t fts_cache_rw_lock_key; +extern mysql_pfs_key_t fts_cache_init_rw_lock_key; +extern mysql_pfs_key_t trx_i_s_cache_lock_key; +extern mysql_pfs_key_t trx_purge_latch_key; +extern mysql_pfs_key_t index_tree_rw_lock_key; +extern mysql_pfs_key_t index_online_log_key; +extern mysql_pfs_key_t dict_table_stats_key; +extern mysql_pfs_key_t trx_sys_rw_lock_key; +extern mysql_pfs_key_t hash_table_rw_lock_key; +#endif /* UNIV_PFS_RWLOCK */ + + +#ifndef UNIV_PFS_RWLOCK +/******************************************************************//** +Creates, or rather, initializes an rw-lock object in a specified memory +location (which must be appropriately aligned). The rw-lock is initialized +to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free +is necessary only if the memory block containing it is freed. +if MySQL performance schema is enabled and "UNIV_PFS_RWLOCK" is +defined, the rwlock are instrumented with performance schema probes. */ +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_create(K, L, level) \ + rw_lock_create_func((L), (level), #L, __FILE__, __LINE__) +# else /* UNIV_SYNC_DEBUG */ +# define rw_lock_create(K, L, level) \ + rw_lock_create_func((L), #L, __FILE__, __LINE__) +# endif/* UNIV_SYNC_DEBUG */ +# else /* UNIV_DEBUG */ +# define rw_lock_create(K, L, level) \ + rw_lock_create_func((L), __FILE__, __LINE__) +# endif /* UNIV_DEBUG */ + +/**************************************************************//** +NOTE! The following macros should be used in rw locking and +unlocking, not the corresponding function. */ + +# define rw_lock_s_lock(M) \ + rw_lock_s_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_s_lock_inline(M, P, F, L) \ + rw_lock_s_lock_func((M), (P), (F), (L)) + +# define rw_lock_s_lock_gen(M, P) \ + rw_lock_s_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_s_lock_gen_nowait(M, P) \ + rw_lock_s_lock_low((M), (P), __FILE__, __LINE__) + +# define rw_lock_s_lock_nowait(M, F, L) \ + rw_lock_s_lock_low((M), 0, (F), (L)) + +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(P, L) +# else +# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L) +# endif + + +# define rw_lock_x_lock(M) \ + rw_lock_x_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_x_lock_inline(M, P, F, L) \ + rw_lock_x_lock_func((M), (P), (F), (L)) + +# define rw_lock_x_lock_gen(M, P) \ + rw_lock_x_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_x_lock_nowait(M) \ + rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__) + +# define rw_lock_x_lock_func_nowait_inline(M, F, L) \ + rw_lock_x_lock_func_nowait((M), (F), (L)) + +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(P, L) +# else +# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L) +# endif + +# define rw_lock_free(M) rw_lock_free_func(M) + +#else /* !UNIV_PFS_RWLOCK */ + +/* Following macros point to Performance Schema instrumented functions. */ +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_create(K, L, level) \ + pfs_rw_lock_create_func((K), (L), (level), #L, __FILE__, __LINE__) +# else /* UNIV_SYNC_DEBUG */ +# define rw_lock_create(K, L, level) \ + pfs_rw_lock_create_func((K), (L), #L, __FILE__, __LINE__) +# endif/* UNIV_SYNC_DEBUG */ +# else /* UNIV_DEBUG */ +# define rw_lock_create(K, L, level) \ + pfs_rw_lock_create_func((K), (L), __FILE__, __LINE__) +# endif /* UNIV_DEBUG */ + +/****************************************************************** +NOTE! The following macros should be used in rw locking and +unlocking, not the corresponding function. */ + +# define rw_lock_s_lock(M) \ + pfs_rw_lock_s_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_s_lock_inline(M, P, F, L) \ + pfs_rw_lock_s_lock_func((M), (P), (F), (L)) + +# define rw_lock_s_lock_gen(M, P) \ + pfs_rw_lock_s_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_s_lock_gen_nowait(M, P) \ + pfs_rw_lock_s_lock_low((M), (P), __FILE__, __LINE__) + +# define rw_lock_s_lock_nowait(M, F, L) \ + pfs_rw_lock_s_lock_low((M), 0, (F), (L)) + +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(P, L) +# else +# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(L) +# endif + +# define rw_lock_x_lock(M) \ + pfs_rw_lock_x_lock_func((M), 0, __FILE__, __LINE__) + +# define rw_lock_x_lock_inline(M, P, F, L) \ + pfs_rw_lock_x_lock_func((M), (P), (F), (L)) + +# define rw_lock_x_lock_gen(M, P) \ + pfs_rw_lock_x_lock_func((M), (P), __FILE__, __LINE__) + +# define rw_lock_x_lock_nowait(M) \ + pfs_rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__) + +# define rw_lock_x_lock_func_nowait_inline(M, F, L) \ + pfs_rw_lock_x_lock_func_nowait((M), (F), (L)) + +# ifdef UNIV_SYNC_DEBUG +# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(P, L) +# else +# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(L) +# endif + +# define rw_lock_free(M) pfs_rw_lock_free_func(M) + +#endif /* UNIV_PFS_RWLOCK */ + +#define rw_lock_s_unlock(L) rw_lock_s_unlock_gen(L, 0) +#define rw_lock_x_unlock(L) rw_lock_x_unlock_gen(L, 0) + +/******************************************************************//** +Creates, or rather, initializes an rw-lock object in a specified memory +location (which must be appropriately aligned). The rw-lock is initialized +to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free +is necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +rw_lock_create_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cmutex_name, /*!< in: mutex name */ +#endif /* UNIV_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline); /*!< in: file line where created */ +/******************************************************************//** +Calling this function is obligatory only if the memory buffer containing +the rw-lock is freed. Removes an rw-lock object from the global list. The +rw-lock is checked to be in the non-locked state. */ +UNIV_INTERN +void +rw_lock_free_func( +/*==============*/ + rw_lock_t* lock); /*!< in: rw-lock */ +#ifdef UNIV_DEBUG +/******************************************************************//** +Checks that the rw-lock has been initialized and that there are no +simultaneous shared and exclusive locks. +@return TRUE */ +UNIV_INTERN +ibool +rw_lock_validate( +/*=============*/ + rw_lock_t* lock); /*!< in: rw-lock */ +#endif /* UNIV_DEBUG */ +/******************************************************************//** +Low-level function which tries to lock an rw-lock in s-mode. Performs no +spinning. +@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_s_lock_low( +/*===============*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass __attribute__((unused)), + /*!< in: pass value; != 0, if the lock will be + passed to another thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function, except if +you supply the file name and line number. Lock an rw-lock in shared mode +for the current thread. If the rw-lock is locked in exclusive mode, or +there is an exclusive lock request waiting, the function spins a preset +time (controlled by SYNC_SPIN_ROUNDS), waiting for the lock, before +suspending the thread. */ +UNIV_INLINE +void +rw_lock_s_lock_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread if the lock can be +obtained immediately. +@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_x_lock_func_nowait( +/*=======================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Releases a shared mode lock. */ +UNIV_INLINE +void +rw_lock_s_unlock_func( +/*==================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + rw_lock_t* lock); /*!< in/out: rw-lock */ + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread. If the rw-lock is locked +in shared or exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting +for the lock, before suspending the thread. If the same thread has an x-lock +on the rw-lock, locking succeed, with the following exception: if pass != 0, +only a single x-lock may be taken on the lock. NOTE: If the same thread has +an s-lock, locking does not succeed! */ +UNIV_INTERN +void +rw_lock_x_lock_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Releases an exclusive mode lock. */ +UNIV_INLINE +void +rw_lock_x_unlock_func( +/*==================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +This function is used in the insert buffer to move the ownership of an +x-latch on a buffer frame to the current thread. The x-latch was set by +the buffer read operation and it protected the buffer frame while the +read was done. The ownership is moved because we want that the current +thread is able to acquire a second x-latch which is stored in an mtr. +This, in turn, is needed to pass the debug checks of index page +operations. */ +UNIV_INTERN +void +rw_lock_x_lock_move_ownership( +/*==========================*/ + rw_lock_t* lock); /*!< in: lock which was x-locked in the + buffer read */ +/******************************************************************//** +Returns the value of writer_count for the lock. Does not reserve the lock +mutex, so the caller must be sure it is not changed during the call. +@return value of writer_count */ +UNIV_INLINE +ulint +rw_lock_get_x_lock_count( +/*=====================*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/********************************************************************//** +Check if there are threads waiting for the rw-lock. +@return 1 if waiters, 0 otherwise */ +UNIV_INLINE +ulint +rw_lock_get_waiters( +/*================*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Returns the write-status of the lock - this function made more sense +with the old rw_lock implementation. +@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */ +UNIV_INLINE +ulint +rw_lock_get_writer( +/*===============*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Returns the number of readers. +@return number of readers */ +UNIV_INLINE +ulint +rw_lock_get_reader_count( +/*=====================*/ + const rw_lock_t* lock); /*!< in: rw-lock */ +/******************************************************************//** +Decrements lock_word the specified amount if it is greater than 0. +This is used by both s_lock and x_lock operations. +@return TRUE if decr occurs */ +UNIV_INLINE +ibool +rw_lock_lock_word_decr( +/*===================*/ + rw_lock_t* lock, /*!< in/out: rw-lock */ + ulint amount); /*!< in: amount to decrement */ +/******************************************************************//** +Increments lock_word the specified amount and returns new value. +@return lock->lock_word after increment */ +UNIV_INLINE +lint +rw_lock_lock_word_incr( +/*===================*/ + rw_lock_t* lock, /*!< in/out: rw-lock */ + ulint amount); /*!< in: amount to increment */ +/******************************************************************//** +This function sets the lock->writer_thread and lock->recursive fields. +For platforms where we are using atomic builtins instead of lock->mutex +it sets the lock->writer_thread field using atomics to ensure memory +ordering. Note that it is assumed that the caller of this function +effectively owns the lock i.e.: nobody else is allowed to modify +lock->writer_thread at this point in time. +The protocol is that lock->writer_thread MUST be updated BEFORE the +lock->recursive flag is set. */ +UNIV_INLINE +void +rw_lock_set_writer_id_and_recursion_flag( +/*=====================================*/ + rw_lock_t* lock, /*!< in/out: lock to work on */ + ibool recursive); /*!< in: TRUE if recursion + allowed */ +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Checks if the thread has locked the rw-lock in the specified mode, with +the pass value == 0. */ +UNIV_INTERN +ibool +rw_lock_own( +/*========*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ + __attribute__((warn_unused_result)); +#endif /* UNIV_SYNC_DEBUG */ +/******************************************************************//** +Checks if somebody has locked the rw-lock in the specified mode. */ +UNIV_INTERN +ibool +rw_lock_is_locked( +/*==============*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint lock_type); /*!< in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ +#ifdef UNIV_SYNC_DEBUG +/***************************************************************//** +Prints debug info of an rw-lock. */ +UNIV_INTERN +void +rw_lock_print( +/*==========*/ + rw_lock_t* lock); /*!< in: rw-lock */ +/***************************************************************//** +Prints debug info of currently locked rw-locks. */ +UNIV_INTERN +void +rw_lock_list_print_info( +/*====================*/ + FILE* file); /*!< in: file where to print */ +/***************************************************************//** +Returns the number of currently locked rw-locks. +Works only in the debug version. +@return number of locked rw-locks */ +UNIV_INTERN +ulint +rw_lock_n_locked(void); +/*==================*/ + +/*#####################################################################*/ + +/******************************************************************//** +Acquires the debug mutex. We cannot use the mutex defined in sync0sync, +because the debug mutex is also acquired in sync0arr while holding the OS +mutex protecting the sync array, and the ordinary mutex_enter might +recursively call routines in sync0arr, leading to a deadlock on the OS +mutex. */ +UNIV_INTERN +void +rw_lock_debug_mutex_enter(void); +/*===========================*/ +/******************************************************************//** +Releases the debug mutex. */ +UNIV_INTERN +void +rw_lock_debug_mutex_exit(void); +/*==========================*/ +/*********************************************************************//** +Prints info of a debug struct. */ +UNIV_INTERN +void +rw_lock_debug_print( +/*================*/ + FILE* f, /*!< in: output stream */ + rw_lock_debug_t* info); /*!< in: debug struct */ +#endif /* UNIV_SYNC_DEBUG */ + +/* NOTE! The structure appears here only for the compiler to know its size. +Do not use its fields directly! */ + +/** The structure used in the spin lock implementation of a read-write +lock. Several threads may have a shared lock simultaneously in this +lock, but only one writer may have an exclusive lock, in which case no +shared locks are allowed. To prevent starving of a writer blocked by +readers, a writer may queue for x-lock by decrementing lock_word: no +new readers will be let in while the thread waits for readers to +exit. */ +struct rw_lock_t { + volatile lint lock_word; + /*!< Holds the state of the lock. */ + volatile ulint waiters;/*!< 1: there are waiters */ + volatile ibool recursive;/*!< Default value FALSE which means the lock + is non-recursive. The value is typically set + to TRUE making normal rw_locks recursive. In + case of asynchronous IO, when a non-zero + value of 'pass' is passed then we keep the + lock non-recursive. + This flag also tells us about the state of + writer_thread field. If this flag is set + then writer_thread MUST contain the thread + id of the current x-holder or wait-x thread. + This flag must be reset in x_unlock + functions before incrementing the lock_word */ + volatile os_thread_id_t writer_thread; + /*!< Thread id of writer thread. Is only + guaranteed to have sane and non-stale + value iff recursive flag is set. */ + os_event_t event; /*!< Used by sync0arr.cc for thread queueing */ + os_event_t wait_ex_event; + /*!< Event for next-writer to wait on. A thread + must decrement lock_word before waiting. */ +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + ib_mutex_t mutex; /*!< The mutex protecting rw_lock_t */ +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ + + UT_LIST_NODE_T(rw_lock_t) list; + /*!< All allocated rw locks are put into a + list */ +#ifdef UNIV_SYNC_DEBUG + UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list; + /*!< In the debug version: pointer to the debug + info list of the lock */ + ulint level; /*!< Level in the global latching order. */ +#endif /* UNIV_SYNC_DEBUG */ +#ifdef UNIV_PFS_RWLOCK + struct PSI_rwlock *pfs_psi;/*!< The instrumentation hook */ +#endif + ulint count_os_wait; /*!< Count of os_waits. May not be accurate */ + const char* cfile_name;/*!< File name where lock created */ + /* last s-lock file/line is not guaranteed to be correct */ + const char* last_s_file_name;/*!< File name where last s-locked */ + const char* last_x_file_name;/*!< File name where last x-locked */ + ibool writer_is_wait_ex; + /*!< This is TRUE if the writer field is + RW_LOCK_WAIT_EX; this field is located far + from the memory update hotspot fields which + are at the start of this struct, thus we can + peek this field without causing much memory + bus traffic */ + unsigned cline:14; /*!< Line where created */ + unsigned last_s_line:14; /*!< Line number where last time s-locked */ + unsigned last_x_line:14; /*!< Line number where last time x-locked */ +#ifdef UNIV_DEBUG + ulint magic_n; /*!< RW_LOCK_MAGIC_N */ +/** Value of rw_lock_t::magic_n */ +#define RW_LOCK_MAGIC_N 22643 +#endif /* UNIV_DEBUG */ +}; + +#ifdef UNIV_SYNC_DEBUG +/** The structure for storing debug info of an rw-lock. All access to this +structure must be protected by rw_lock_debug_mutex_enter(). */ +struct rw_lock_debug_t { + + os_thread_id_t thread_id; /*!< The thread id of the thread which + locked the rw-lock */ + ulint pass; /*!< Pass value given in the lock operation */ + ulint lock_type; /*!< Type of the lock: RW_LOCK_EX, + RW_LOCK_SHARED, RW_LOCK_WAIT_EX */ + const char* file_name;/*!< File name where the lock was obtained */ + ulint line; /*!< Line where the rw-lock was locked */ + UT_LIST_NODE_T(rw_lock_debug_t) list; + /*!< Debug structs are linked in a two-way + list */ +}; +#endif /* UNIV_SYNC_DEBUG */ + +/* For performance schema instrumentation, a new set of rwlock +wrap functions are created if "UNIV_PFS_RWLOCK" is defined. +The instrumentations are not planted directly into original +functions, so that we keep the underlying function as they +are. And in case, user wants to "take out" some rwlock from +instrumentation even if performance schema (UNIV_PFS_RWLOCK) +is defined, they can do so by reinstating APIs directly link to +original underlying functions. +The instrumented function names have prefix of "pfs_rw_lock_" vs. +original name prefix of "rw_lock_". Following are list of functions +that have been instrumented: + +rw_lock_create() +rw_lock_x_lock() +rw_lock_x_lock_gen() +rw_lock_x_lock_nowait() +rw_lock_x_unlock_gen() +rw_lock_s_lock() +rw_lock_s_lock_gen() +rw_lock_s_lock_nowait() +rw_lock_s_unlock_gen() +rw_lock_free() +*/ + +#ifdef UNIV_PFS_RWLOCK +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_create_func() +NOTE! Please use the corresponding macro rw_lock_create(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_create_func( +/*====================*/ + PSI_rwlock_key key, /*!< in: key registered with + performance schema */ + rw_lock_t* lock, /*!< in: rw lock */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cmutex_name, /*!< in: mutex name */ +#endif /* UNIV_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline); /*!< in: file line where created */ + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for +rw_lock_x_lock_func_nowait() +NOTE! Please use the corresponding macro, not directly this function! +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_x_lock_func_nowait( +/*===========================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly +this function! +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_s_lock_low( +/*===================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_unlock_func() +NOTE! Please use the corresponding macro rw_lock_s_unlock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_unlock_func( +/*======================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif + rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_unlock_func() +NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_unlock_func( +/*======================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif + rw_lock_t* lock); /*!< in/out: rw-lock */ +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_free_func() +NOTE! Please use the corresponding macro rw_lock_free(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_free_func( +/*==================*/ + rw_lock_t* lock); /*!< in: rw-lock */ +#endif /* UNIV_PFS_RWLOCK */ + + +#ifndef UNIV_NONINL +#include "sync0rw.ic" +#endif +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic new file mode 100644 index 00000000000..bb05ae7daf1 --- /dev/null +++ b/storage/innobase/include/sync0rw.ic @@ -0,0 +1,797 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0rw.ic +The read-write lock (for threads) + +Created 9/11/1995 Heikki Tuuri +*******************************************************/ + +/******************************************************************//** +Lock an rw-lock in shared mode for the current thread. If the rw-lock is +locked in exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), +waiting for the lock before suspending the thread. */ +UNIV_INTERN +void +rw_lock_s_lock_spin( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line); /*!< in: line where requested */ +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Inserts the debug information for an rw-lock. */ +UNIV_INTERN +void +rw_lock_add_debug_info( +/*===================*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint pass, /*!< in: pass value */ + ulint lock_type, /*!< in: lock type */ + const char* file_name, /*!< in: file where requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +Removes a debug information struct for an rw-lock. */ +UNIV_INTERN +void +rw_lock_remove_debug_info( +/*======================*/ + rw_lock_t* lock, /*!< in: rw-lock */ + ulint pass, /*!< in: pass value */ + ulint lock_type); /*!< in: lock type */ +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************//** +Check if there are threads waiting for the rw-lock. +@return 1 if waiters, 0 otherwise */ +UNIV_INLINE +ulint +rw_lock_get_waiters( +/*================*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + return(lock->waiters); +} + +/********************************************************************//** +Sets lock->waiters to 1. It is not an error if lock->waiters is already +1. On platforms where ATOMIC builtins are used this function enforces a +memory barrier. */ +UNIV_INLINE +void +rw_lock_set_waiter_flag( +/*====================*/ + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + (void) os_compare_and_swap_ulint(&lock->waiters, 0, 1); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + lock->waiters = 1; + os_wmb; +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/********************************************************************//** +Resets lock->waiters to 0. It is not an error if lock->waiters is already +0. On platforms where ATOMIC builtins are used this function enforces a +memory barrier. */ +UNIV_INLINE +void +rw_lock_reset_waiter_flag( +/*======================*/ + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + (void) os_compare_and_swap_ulint(&lock->waiters, 1, 0); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + lock->waiters = 0; + os_wmb; +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/******************************************************************//** +Returns the write-status of the lock - this function made more sense +with the old rw_lock implementation. +@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */ +UNIV_INLINE +ulint +rw_lock_get_writer( +/*===============*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + lint lock_word = lock->lock_word; + if (lock_word > 0) { + /* return NOT_LOCKED in s-lock state, like the writer + member of the old lock implementation. */ + return(RW_LOCK_NOT_LOCKED); + } else if ((lock_word == 0) || (lock_word <= -X_LOCK_DECR)) { + return(RW_LOCK_EX); + } else { + ut_ad(lock_word > -X_LOCK_DECR); + return(RW_LOCK_WAIT_EX); + } +} + +/******************************************************************//** +Returns the number of readers. +@return number of readers */ +UNIV_INLINE +ulint +rw_lock_get_reader_count( +/*=====================*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + lint lock_word = lock->lock_word; + if (lock_word > 0) { + /* s-locked, no x-waiters */ + return(X_LOCK_DECR - lock_word); + } else if (lock_word < 0 && lock_word > -X_LOCK_DECR) { + /* s-locked, with x-waiters */ + return((ulint)(-lock_word)); + } + return(0); +} + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS +UNIV_INLINE +ib_mutex_t* +rw_lock_get_mutex( +/*==============*/ + rw_lock_t* lock) +{ + return(&(lock->mutex)); +} +#endif + +/******************************************************************//** +Returns the value of writer_count for the lock. Does not reserve the lock +mutex, so the caller must be sure it is not changed during the call. +@return value of writer_count */ +UNIV_INLINE +ulint +rw_lock_get_x_lock_count( +/*=====================*/ + const rw_lock_t* lock) /*!< in: rw-lock */ +{ + lint lock_copy = lock->lock_word; + if ((lock_copy != 0) && (lock_copy > -X_LOCK_DECR)) { + return(0); + } + return((lock_copy == 0) ? 1 : (2 - (lock_copy + X_LOCK_DECR))); +} + +/******************************************************************//** +Two different implementations for decrementing the lock_word of a rw_lock: +one for systems supporting atomic operations, one for others. This does +does not support recusive x-locks: they should be handled by the caller and +need not be atomic since they are performed by the current lock holder. +Returns true if the decrement was made, false if not. +@return TRUE if decr occurs */ +UNIV_INLINE +ibool +rw_lock_lock_word_decr( +/*===================*/ + rw_lock_t* lock, /*!< in/out: rw-lock */ + ulint amount) /*!< in: amount to decrement */ +{ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + lint local_lock_word; + + os_rmb; + local_lock_word = lock->lock_word; + while (local_lock_word > 0) { + if (os_compare_and_swap_lint(&lock->lock_word, + local_lock_word, + local_lock_word - amount)) { + return(TRUE); + } + local_lock_word = lock->lock_word; + } + return(FALSE); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + ibool success = FALSE; + mutex_enter(&(lock->mutex)); + if (lock->lock_word > 0) { + lock->lock_word -= amount; + success = TRUE; + } + mutex_exit(&(lock->mutex)); + return(success); +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/******************************************************************//** +Increments lock_word the specified amount and returns new value. +@return lock->lock_word after increment */ +UNIV_INLINE +lint +rw_lock_lock_word_incr( +/*===================*/ + rw_lock_t* lock, /*!< in/out: rw-lock */ + ulint amount) /*!< in: amount of increment */ +{ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + return(os_atomic_increment_lint(&lock->lock_word, amount)); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + lint local_lock_word; + + mutex_enter(&(lock->mutex)); + + lock->lock_word += amount; + local_lock_word = lock->lock_word; + + mutex_exit(&(lock->mutex)); + + return(local_lock_word); +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/******************************************************************//** +This function sets the lock->writer_thread and lock->recursive fields. +For platforms where we are using atomic builtins instead of lock->mutex +it sets the lock->writer_thread field using atomics to ensure memory +ordering. Note that it is assumed that the caller of this function +effectively owns the lock i.e.: nobody else is allowed to modify +lock->writer_thread at this point in time. +The protocol is that lock->writer_thread MUST be updated BEFORE the +lock->recursive flag is set. */ +UNIV_INLINE +void +rw_lock_set_writer_id_and_recursion_flag( +/*=====================================*/ + rw_lock_t* lock, /*!< in/out: lock to work on */ + ibool recursive) /*!< in: TRUE if recursion + allowed */ +{ + os_thread_id_t curr_thread = os_thread_get_curr_id(); + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + os_thread_id_t local_thread; + ibool success; + + /* Prevent Valgrind warnings about writer_thread being + uninitialized. It does not matter if writer_thread is + uninitialized, because we are comparing writer_thread against + itself, and the operation should always succeed. */ + UNIV_MEM_VALID(&lock->writer_thread, sizeof lock->writer_thread); + + local_thread = lock->writer_thread; + success = os_compare_and_swap_thread_id( + &lock->writer_thread, local_thread, curr_thread); + ut_a(success); + lock->recursive = recursive; + +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + + mutex_enter(&lock->mutex); + lock->writer_thread = curr_thread; + lock->recursive = recursive; + mutex_exit(&lock->mutex); + +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/******************************************************************//** +Low-level function which tries to lock an rw-lock in s-mode. Performs no +spinning. +@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_s_lock_low( +/*===============*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass __attribute__((unused)), + /*!< in: pass value; != 0, if the lock will be + passed to another thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + if (!rw_lock_lock_word_decr(lock, 1)) { + /* Locking did not succeed */ + return(FALSE); + } + +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line); +#endif + /* These debugging values are not set safely: they may be incorrect + or even refer to a line that is invalid for the file name. */ + lock->last_s_file_name = file_name; + lock->last_s_line = line; + + return(TRUE); /* locking succeeded */ +} + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in shared mode for the current thread. If the rw-lock is locked +in exclusive mode, or there is an exclusive lock request waiting, the +function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for +the lock, before suspending the thread. */ +UNIV_INLINE +void +rw_lock_s_lock_func( +/*================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + /* NOTE: As we do not know the thread ids for threads which have + s-locked a latch, and s-lockers will be served only after waiting + x-lock requests have been fulfilled, then if this thread already + owns an s-lock here, it may end up in a deadlock with another thread + which requests an x-lock here. Therefore, we will forbid recursive + s-locking of a latch: the following assert will warn the programmer + of the possibility of this kind of a deadlock. If we want to implement + safe recursive s-locking, we should keep in a list the thread ids of + the threads which have s-locked a latch. This would use some CPU + time. */ + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */ + ut_ad(!rw_lock_own(lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (rw_lock_s_lock_low(lock, pass, file_name, line)) { + + return; /* Success */ + } else { + /* Did not succeed, try spin wait */ + + rw_lock_s_lock_spin(lock, pass, file_name, line); + + return; + } +} + +/******************************************************************//** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread if the lock can be +obtained immediately. +@return TRUE if success */ +UNIV_INLINE +ibool +rw_lock_x_lock_func_nowait( +/*=======================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + ibool success; + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + success = os_compare_and_swap_lint(&lock->lock_word, X_LOCK_DECR, 0); +#else + + success = FALSE; + mutex_enter(&(lock->mutex)); + if (lock->lock_word == X_LOCK_DECR) { + lock->lock_word = 0; + success = TRUE; + } + mutex_exit(&(lock->mutex)); + +#endif + if (success) { + rw_lock_set_writer_id_and_recursion_flag(lock, TRUE); + + } else if (lock->recursive + && os_thread_eq(lock->writer_thread, + os_thread_get_curr_id())) { + /* Relock: this lock_word modification is safe since no other + threads can modify (lock, unlock, or reserve) lock_word while + there is an exclusive writer and this is the writer thread. */ + if (lock->lock_word == 0) { + lock->lock_word = -X_LOCK_DECR; + } else { + lock->lock_word--; + } + + /* Watch for too many recursive locks */ + ut_ad(lock->lock_word < 0); + + } else { + /* Failure */ + return(FALSE); + } +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); +#endif + + lock->last_x_file_name = file_name; + lock->last_x_line = line; + + ut_ad(rw_lock_validate(lock)); + + return(TRUE); +} + +/******************************************************************//** +Releases a shared mode lock. */ +UNIV_INLINE +void +rw_lock_s_unlock_func( +/*==================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + ut_ad(lock->lock_word > -X_LOCK_DECR); + ut_ad(lock->lock_word != 0); + ut_ad(lock->lock_word < X_LOCK_DECR); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED); +#endif + + /* Increment lock_word to indicate 1 less reader */ + if (rw_lock_lock_word_incr(lock, 1) == 0) { + + /* wait_ex waiter exists. It may not be asleep, but we signal + anyway. We do not wake other waiters, because they can't + exist without wait_ex waiter and wait_ex waiter goes first.*/ + os_event_set(lock->wait_ex_event); + sync_array_object_signalled(); + + } + + ut_ad(rw_lock_validate(lock)); + +#ifdef UNIV_SYNC_PERF_STAT + rw_s_exit_count++; +#endif +} + +/******************************************************************//** +Releases an exclusive mode lock. */ +UNIV_INLINE +void +rw_lock_x_unlock_func( +/*==================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + ut_ad(lock->lock_word == 0 || lock->lock_word <= -X_LOCK_DECR); + + /* lock->recursive flag also indicates if lock->writer_thread is + valid or stale. If we are the last of the recursive callers + then we must unset lock->recursive flag to indicate that the + lock->writer_thread is now stale. + Note that since we still hold the x-lock we can safely read the + lock_word. */ + if (lock->lock_word == 0) { + /* Last caller in a possible recursive chain. */ + lock->recursive = FALSE; + } + +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX); +#endif + + ulint x_lock_incr; + if (lock->lock_word == 0) { + x_lock_incr = X_LOCK_DECR; + } else if (lock->lock_word == -X_LOCK_DECR) { + x_lock_incr = X_LOCK_DECR; + } else { + ut_ad(lock->lock_word < -X_LOCK_DECR); + x_lock_incr = 1; + } + + if (rw_lock_lock_word_incr(lock, x_lock_incr) == X_LOCK_DECR) { + /* Lock is now free. May have to signal read/write waiters. + We do not need to signal wait_ex waiters, since they cannot + exist when there is a writer. */ + if (lock->waiters) { + rw_lock_reset_waiter_flag(lock); + os_event_set(lock->event); + sync_array_object_signalled(); + } + } + + ut_ad(rw_lock_validate(lock)); + +#ifdef UNIV_SYNC_PERF_STAT + rw_x_exit_count++; +#endif +} + +#ifdef UNIV_PFS_RWLOCK + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_create_func(). +NOTE! Please use the corresponding macro rw_lock_create(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_create_func( +/*====================*/ + mysql_pfs_key_t key, /*!< in: key registered with + performance schema */ + rw_lock_t* lock, /*!< in: pointer to memory */ +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cmutex_name, /*!< in: mutex name */ +# endif /* UNIV_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline) /*!< in: file line where created */ +{ + /* Initialize the rwlock for performance schema */ + lock->pfs_psi = PSI_RWLOCK_CALL(init_rwlock)(key, lock); + + /* The actual function to initialize an rwlock */ + rw_lock_create_func(lock, +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + level, +# endif /* UNIV_SYNC_DEBUG */ + cmutex_name, +# endif /* UNIV_DEBUG */ + cfile_name, + cline); +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_lock_func() +NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + if (lock->pfs_psi != NULL) + { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Record the entry of rw x lock request in performance schema */ + locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)( + &state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK, + file_name, static_cast<uint>(line)); + + rw_lock_x_lock_func( + lock, pass, file_name, static_cast<uint>(line)); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0); + } + } + else + { + rw_lock_x_lock_func(lock, pass, file_name, line); + } +} +/******************************************************************//** +Performance schema instrumented wrap function for +rw_lock_x_lock_func_nowait() +NOTE! Please use the corresponding macro rw_lock_x_lock_func(), +not directly this function! +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_x_lock_func_nowait( +/*===========================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + const char* file_name,/*!< in: file name where lock + requested */ + ulint line) /*!< in: line where requested */ +{ + ibool ret; + + if (lock->pfs_psi != NULL) + { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Record the entry of rw x lock request in performance schema */ + locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)( + &state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK, + file_name, static_cast<uint>(line)); + + ret = rw_lock_x_lock_func_nowait(lock, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_wrwait)( + locker, static_cast<int>(ret)); + } + } + else + { + ret = rw_lock_x_lock_func_nowait(lock, file_name, line); + } + + return(ret); +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_free_func() +NOTE! Please use the corresponding macro rw_lock_free(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_free_func( +/*==================*/ + rw_lock_t* lock) /*!< in: pointer to rw-lock */ +{ + if (lock->pfs_psi != NULL) + { + PSI_RWLOCK_CALL(destroy_rwlock)(lock->pfs_psi); + lock->pfs_psi = NULL; + } + + rw_lock_free_func(lock); +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_lock_func( +/*====================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name,/*!< in: file name where lock + requested */ + ulint line) /*!< in: line where requested */ +{ + if (lock->pfs_psi != NULL) + { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Instrumented to inform we are aquiring a shared rwlock */ + locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)( + &state, lock->pfs_psi, PSI_RWLOCK_READLOCK, + file_name, static_cast<uint>(line)); + + rw_lock_s_lock_func(lock, pass, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0); + } + } + else + { + rw_lock_s_lock_func(lock, pass, file_name, line); + } + + return; +} +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_lock_func() +NOTE! Please use the corresponding macro rw_lock_s_lock(), not +directly this function! +@return TRUE if success */ +UNIV_INLINE +ibool +pfs_rw_lock_s_lock_low( +/*===================*/ + rw_lock_t* lock, /*!< in: pointer to rw-lock */ + ulint pass, /*!< in: pass value; != 0, if the + lock will be passed to another + thread to unlock */ + const char* file_name, /*!< in: file name where lock requested */ + ulint line) /*!< in: line where requested */ +{ + ibool ret; + + if (lock->pfs_psi != NULL) + { + PSI_rwlock_locker* locker; + PSI_rwlock_locker_state state; + + /* Instrumented to inform we are aquiring a shared rwlock */ + locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)( + &state, lock->pfs_psi, PSI_RWLOCK_READLOCK, + file_name, static_cast<uint>(line)); + + ret = rw_lock_s_lock_low(lock, pass, file_name, line); + + if (locker != NULL) { + PSI_RWLOCK_CALL(end_rwlock_rdwait)( + locker, static_cast<int>(ret)); + } + } + else + { + ret = rw_lock_s_lock_low(lock, pass, file_name, line); + } + + return(ret); +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_x_unlock_func() +NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly +this function! */ +UNIV_INLINE +void +pfs_rw_lock_x_unlock_func( +/*======================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + /* Inform performance schema we are unlocking the lock */ + if (lock->pfs_psi != NULL) + PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi); + + rw_lock_x_unlock_func( +#ifdef UNIV_SYNC_DEBUG + pass, +#endif + lock); +} + +/******************************************************************//** +Performance schema instrumented wrap function for rw_lock_s_unlock_func() +NOTE! Please use the corresponding macro pfs_rw_lock_s_unlock(), not +directly this function! */ +UNIV_INLINE +void +pfs_rw_lock_s_unlock_func( +/*======================*/ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /*!< in: pass value; != 0, if the + lock may have been passed to another + thread to unlock */ +#endif + rw_lock_t* lock) /*!< in/out: rw-lock */ +{ + /* Inform performance schema we are unlocking the lock */ + if (lock->pfs_psi != NULL) + PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi); + + rw_lock_s_unlock_func( +#ifdef UNIV_SYNC_DEBUG + pass, +#endif + lock); + +} +#endif /* UNIV_PFS_RWLOCK */ diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h new file mode 100644 index 00000000000..82fb353a41b --- /dev/null +++ b/storage/innobase/include/sync0sync.h @@ -0,0 +1,845 @@ +/***************************************************************************** + +Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. +Copyright (c) 2012, Facebook Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0sync.h +Mutex, the basic synchronization primitive + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0sync_h +#define sync0sync_h + +#include "univ.i" +#include "sync0types.h" +#include "ut0lst.h" +#include "ut0mem.h" +#include "os0thread.h" +#include "os0sync.h" +#include "sync0arr.h" + +#if defined(UNIV_DEBUG) && !defined(UNIV_HOTBACKUP) +extern "C" my_bool timed_mutexes; +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + +#ifdef HAVE_WINDOWS_ATOMICS +typedef LONG lock_word_t; /*!< On Windows, InterlockedExchange operates + on LONG variable */ +#elif defined(HAVE_ATOMIC_BUILTINS) && !defined(HAVE_ATOMIC_BUILTINS_BYTE) +typedef ulint lock_word_t; +#else +typedef byte lock_word_t; +#endif + +#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK + +/* By default, buffer mutexes and rwlocks will be excluded from +instrumentation due to their large number of instances. */ +# define PFS_SKIP_BUFFER_MUTEX_RWLOCK + +/* By default, event->mutex will also be excluded from instrumentation */ +# define PFS_SKIP_EVENT_MUTEX + +#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */ + +#ifdef UNIV_PFS_MUTEX +/* Key defines to register InnoDB mutexes with performance schema */ +extern mysql_pfs_key_t autoinc_mutex_key; +extern mysql_pfs_key_t buffer_block_mutex_key; +extern mysql_pfs_key_t buf_pool_mutex_key; +extern mysql_pfs_key_t buf_pool_zip_mutex_key; +extern mysql_pfs_key_t cache_last_read_mutex_key; +extern mysql_pfs_key_t dict_foreign_err_mutex_key; +extern mysql_pfs_key_t dict_sys_mutex_key; +extern mysql_pfs_key_t file_format_max_mutex_key; +extern mysql_pfs_key_t fil_system_mutex_key; +extern mysql_pfs_key_t flush_list_mutex_key; +extern mysql_pfs_key_t fts_bg_threads_mutex_key; +extern mysql_pfs_key_t fts_delete_mutex_key; +extern mysql_pfs_key_t fts_optimize_mutex_key; +extern mysql_pfs_key_t fts_doc_id_mutex_key; +extern mysql_pfs_key_t fts_pll_tokenize_mutex_key; +extern mysql_pfs_key_t hash_table_mutex_key; +extern mysql_pfs_key_t ibuf_bitmap_mutex_key; +extern mysql_pfs_key_t ibuf_mutex_key; +extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; +extern mysql_pfs_key_t log_sys_mutex_key; +extern mysql_pfs_key_t log_flush_order_mutex_key; +# ifndef HAVE_ATOMIC_BUILTINS +extern mysql_pfs_key_t server_mutex_key; +# endif /* !HAVE_ATOMIC_BUILTINS */ +# ifdef UNIV_MEM_DEBUG +extern mysql_pfs_key_t mem_hash_mutex_key; +# endif /* UNIV_MEM_DEBUG */ +extern mysql_pfs_key_t mem_pool_mutex_key; +extern mysql_pfs_key_t mutex_list_mutex_key; +extern mysql_pfs_key_t purge_sys_bh_mutex_key; +extern mysql_pfs_key_t recv_sys_mutex_key; +extern mysql_pfs_key_t recv_writer_mutex_key; +extern mysql_pfs_key_t rseg_mutex_key; +# ifdef UNIV_SYNC_DEBUG +extern mysql_pfs_key_t rw_lock_debug_mutex_key; +# endif /* UNIV_SYNC_DEBUG */ +extern mysql_pfs_key_t rw_lock_list_mutex_key; +extern mysql_pfs_key_t rw_lock_mutex_key; +extern mysql_pfs_key_t srv_dict_tmpfile_mutex_key; +extern mysql_pfs_key_t srv_innodb_monitor_mutex_key; +extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key; +extern mysql_pfs_key_t srv_threads_mutex_key; +extern mysql_pfs_key_t srv_monitor_file_mutex_key; +# ifdef UNIV_SYNC_DEBUG +extern mysql_pfs_key_t sync_thread_mutex_key; +# endif /* UNIV_SYNC_DEBUG */ +extern mysql_pfs_key_t buf_dblwr_mutex_key; +extern mysql_pfs_key_t trx_undo_mutex_key; +extern mysql_pfs_key_t trx_mutex_key; +extern mysql_pfs_key_t lock_sys_mutex_key; +extern mysql_pfs_key_t lock_sys_wait_mutex_key; +extern mysql_pfs_key_t trx_sys_mutex_key; +extern mysql_pfs_key_t srv_sys_mutex_key; +extern mysql_pfs_key_t srv_sys_tasks_mutex_key; +#ifndef HAVE_ATOMIC_BUILTINS +extern mysql_pfs_key_t srv_conc_mutex_key; +#endif /* !HAVE_ATOMIC_BUILTINS */ +#ifndef HAVE_ATOMIC_BUILTINS_64 +extern mysql_pfs_key_t monitor_mutex_key; +#endif /* !HAVE_ATOMIC_BUILTINS_64 */ +extern mysql_pfs_key_t event_os_mutex_key; +extern mysql_pfs_key_t ut_list_mutex_key; +extern mysql_pfs_key_t os_mutex_key; +extern mysql_pfs_key_t zip_pad_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/******************************************************************//** +Initializes the synchronization data structures. */ +UNIV_INTERN +void +sync_init(void); +/*===========*/ +/******************************************************************//** +Frees the resources in synchronization data structures. */ +UNIV_INTERN +void +sync_close(void); +/*===========*/ + +#undef mutex_free /* Fix for MacOS X */ + +#ifdef UNIV_PFS_MUTEX +/********************************************************************** +Following mutex APIs would be performance schema instrumented +if "UNIV_PFS_MUTEX" is defined: + +mutex_create +mutex_enter +mutex_exit +mutex_enter_nowait +mutex_free + +These mutex APIs will point to corresponding wrapper functions that contain +the performance schema instrumentation if "UNIV_PFS_MUTEX" is defined. +The instrumented wrapper functions have the prefix of "innodb_". + +NOTE! The following macro should be used in mutex operation, not the +corresponding function. */ + +/******************************************************************//** +Creates, or rather, initializes a mutex object to a specified memory +location (which must be appropriately aligned). The mutex is initialized +in the reset state. Explicit freeing of the mutex with mutex_free is +necessary only if the memory block containing it is freed. */ +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG +# define mutex_create(K, M, level) \ + pfs_mutex_create_func((K), (M), #M, (level), __FILE__, __LINE__) +# else +# define mutex_create(K, M, level) \ + pfs_mutex_create_func((K), (M), #M, __FILE__, __LINE__) +# endif/* UNIV_SYNC_DEBUG */ +# else +# define mutex_create(K, M, level) \ + pfs_mutex_create_func((K), (M), __FILE__, __LINE__) +# endif /* UNIV_DEBUG */ + +# define mutex_enter(M) \ + pfs_mutex_enter_func((M), __FILE__, __LINE__) + +# define mutex_enter_nowait(M) \ + pfs_mutex_enter_nowait_func((M), __FILE__, __LINE__) + +# define mutex_exit(M) pfs_mutex_exit_func(M) + +# define mutex_free(M) pfs_mutex_free_func(M) + +#else /* UNIV_PFS_MUTEX */ + +/* If "UNIV_PFS_MUTEX" is not defined, the mutex APIs point to +original non-instrumented functions */ +# ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG +# define mutex_create(K, M, level) \ + mutex_create_func((M), #M, (level), __FILE__, __LINE__) +# else /* UNIV_SYNC_DEBUG */ +# define mutex_create(K, M, level) \ + mutex_create_func((M), #M, __FILE__, __LINE__) +# endif /* UNIV_SYNC_DEBUG */ +# else /* UNIV_DEBUG */ +# define mutex_create(K, M, level) \ + mutex_create_func((M), __FILE__, __LINE__) +# endif /* UNIV_DEBUG */ + +# define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__) + +# define mutex_enter_nowait(M) \ + mutex_enter_nowait_func((M), __FILE__, __LINE__) + +# define mutex_exit(M) mutex_exit_func(M) + +# define mutex_free(M) mutex_free_func(M) + +#endif /* UNIV_PFS_MUTEX */ + +/******************************************************************//** +Creates, or rather, initializes a mutex object in a specified memory +location (which must be appropriately aligned). The mutex is initialized +in the reset state. Explicit freeing of the mutex with mutex_free is +necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +mutex_create_func( +/*==============*/ + ib_mutex_t* mutex, /*!< in: pointer to memory */ +#ifdef UNIV_DEBUG + const char* cmutex_name, /*!< in: mutex name */ +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ +#endif /* UNIV_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline); /*!< in: file line where created */ + +/******************************************************************//** +NOTE! Use the corresponding macro mutex_free(), not directly this function! +Calling this function is obligatory only if the memory buffer containing +the mutex is freed. Removes a mutex object from the mutex list. The mutex +is checked to be in the reset state. */ +UNIV_INTERN +void +mutex_free_func( +/*============*/ + ib_mutex_t* mutex); /*!< in: mutex */ +/**************************************************************//** +NOTE! The following macro should be used in mutex locking, not the +corresponding function. */ + +/* NOTE! currently same as mutex_enter! */ + +#define mutex_enter_fast(M) mutex_enter_func((M), __FILE__, __LINE__) +/******************************************************************//** +NOTE! Use the corresponding macro in the header file, not this function +directly. Locks a mutex for the current thread. If the mutex is reserved +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting +for the mutex before suspending the thread. */ +UNIV_INLINE +void +mutex_enter_func( +/*=============*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where locked */ + ulint line); /*!< in: line where locked */ +/********************************************************************//** +NOTE! Use the corresponding macro in the header file, not this function +directly. Tries to lock the mutex for the current thread. If the lock is not +acquired immediately, returns with return value 1. +@return 0 if succeed, 1 if not */ +UNIV_INTERN +ulint +mutex_enter_nowait_func( +/*====================*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where mutex + requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +NOTE! Use the corresponding macro mutex_exit(), not directly this function! +Unlocks a mutex owned by the current thread. */ +UNIV_INLINE +void +mutex_exit_func( +/*============*/ + ib_mutex_t* mutex); /*!< in: pointer to mutex */ + + +#ifdef UNIV_PFS_MUTEX +/******************************************************************//** +NOTE! Please use the corresponding macro mutex_create(), not directly +this function! +A wrapper function for mutex_create_func(), registers the mutex +with peformance schema if "UNIV_PFS_MUTEX" is defined when +creating the mutex */ +UNIV_INLINE +void +pfs_mutex_create_func( +/*==================*/ + PSI_mutex_key key, /*!< in: Performance Schema key */ + ib_mutex_t* mutex, /*!< in: pointer to memory */ +# ifdef UNIV_DEBUG + const char* cmutex_name, /*!< in: mutex name */ +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ +# endif /* UNIV_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline); /*!< in: file line where created */ +/******************************************************************//** +NOTE! Please use the corresponding macro mutex_enter(), not directly +this function! +This is a performance schema instrumented wrapper function for +mutex_enter_func(). */ +UNIV_INLINE +void +pfs_mutex_enter_func( +/*=================*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where locked */ + ulint line); /*!< in: line where locked */ +/********************************************************************//** +NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly +this function! +This is a performance schema instrumented wrapper function for +mutex_enter_nowait_func. +@return 0 if succeed, 1 if not */ +UNIV_INLINE +ulint +pfs_mutex_enter_nowait_func( +/*========================*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where mutex + requested */ + ulint line); /*!< in: line where requested */ +/******************************************************************//** +NOTE! Please use the corresponding macro mutex_exit(), not directly +this function! +A wrap function of mutex_exit_func() with peformance schema instrumentation. +Unlocks a mutex owned by the current thread. */ +UNIV_INLINE +void +pfs_mutex_exit_func( +/*================*/ + ib_mutex_t* mutex); /*!< in: pointer to mutex */ + +/******************************************************************//** +NOTE! Please use the corresponding macro mutex_free(), not directly +this function! +Wrapper function for mutex_free_func(). Also destroys the performance +schema probes when freeing the mutex */ +UNIV_INLINE +void +pfs_mutex_free_func( +/*================*/ + ib_mutex_t* mutex); /*!< in: mutex */ + +#endif /* UNIV_PFS_MUTEX */ + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Returns TRUE if no mutex or rw-lock is currently locked. +Works only in the debug version. +@return TRUE if no mutexes and rw-locks reserved */ +UNIV_INTERN +ibool +sync_all_freed(void); +/*================*/ +#endif /* UNIV_SYNC_DEBUG */ +/*##################################################################### +FUNCTION PROTOTYPES FOR DEBUGGING */ +/*******************************************************************//** +Prints wait info of the sync system. */ +UNIV_INTERN +void +sync_print_wait_info( +/*=================*/ + FILE* file); /*!< in: file where to print */ +/*******************************************************************//** +Prints info of the sync system. */ +UNIV_INTERN +void +sync_print( +/*=======*/ + FILE* file); /*!< in: file where to print */ +#ifdef UNIV_DEBUG +/******************************************************************//** +Checks that the mutex has been initialized. +@return TRUE */ +UNIV_INTERN +ibool +mutex_validate( +/*===========*/ + const ib_mutex_t* mutex); /*!< in: mutex */ +/******************************************************************//** +Checks that the current thread owns the mutex. Works only +in the debug version. +@return TRUE if owns */ +UNIV_INTERN +ibool +mutex_own( +/*======*/ + const ib_mutex_t* mutex) /*!< in: mutex */ + __attribute__((warn_unused_result)); +#endif /* UNIV_DEBUG */ +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Adds a latch and its level in the thread level array. Allocates the memory +for the array if called first time for this OS thread. Makes the checks +against other latch levels stored in the array for this thread. */ +UNIV_INTERN +void +sync_thread_add_level( +/*==================*/ + void* latch, /*!< in: pointer to a mutex or an rw-lock */ + ulint level, /*!< in: level in the latching order; if + SYNC_LEVEL_VARYING, nothing is done */ + ibool relock) /*!< in: TRUE if re-entering an x-lock */ + __attribute__((nonnull)); +/******************************************************************//** +Removes a latch from the thread level array if it is found there. +@return TRUE if found in the array; it is no error if the latch is +not found, as we presently are not able to determine the level for +every latch reservation the program does */ +UNIV_INTERN +ibool +sync_thread_reset_level( +/*====================*/ + void* latch); /*!< in: pointer to a mutex or an rw-lock */ +/******************************************************************//** +Checks if the level array for the current thread contains a +mutex or rw-latch at the specified level. +@return a matching latch, or NULL if not found */ +UNIV_INTERN +void* +sync_thread_levels_contains( +/*========================*/ + ulint level); /*!< in: latching order level + (SYNC_DICT, ...)*/ +/******************************************************************//** +Checks that the level array for the current thread is empty. +@return a latch, or NULL if empty except the exceptions specified below */ +UNIV_INTERN +void* +sync_thread_levels_nonempty_gen( +/*============================*/ + ibool dict_mutex_allowed) /*!< in: TRUE if dictionary mutex is + allowed to be owned by the thread */ + __attribute__((warn_unused_result)); +/******************************************************************//** +Checks if the level array for the current thread is empty, +except for data dictionary latches. */ +#define sync_thread_levels_empty_except_dict() \ + (!sync_thread_levels_nonempty_gen(TRUE)) +/******************************************************************//** +Checks if the level array for the current thread is empty, +except for the btr_search_latch. +@return a latch, or NULL if empty except the exceptions specified below */ +UNIV_INTERN +void* +sync_thread_levels_nonempty_trx( +/*============================*/ + ibool has_search_latch) + /*!< in: TRUE if and only if the thread + is supposed to hold btr_search_latch */ + __attribute__((warn_unused_result)); + +/******************************************************************//** +Gets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_get_debug_info( +/*=================*/ + ib_mutex_t* mutex, /*!< in: mutex */ + const char** file_name, /*!< out: file where requested */ + ulint* line, /*!< out: line where requested */ + os_thread_id_t* thread_id); /*!< out: id of the thread which owns + the mutex */ +/******************************************************************//** +Counts currently reserved mutexes. Works only in the debug version. +@return number of reserved mutexes */ +UNIV_INTERN +ulint +mutex_n_reserved(void); +/*==================*/ +#endif /* UNIV_SYNC_DEBUG */ +/******************************************************************//** +NOT to be used outside this module except in debugging! Gets the value +of the lock word. */ +UNIV_INLINE +lock_word_t +mutex_get_lock_word( +/*================*/ + const ib_mutex_t* mutex); /*!< in: mutex */ +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +NOT to be used outside this module except in debugging! Gets the waiters +field in a mutex. +@return value to set */ +UNIV_INLINE +ulint +mutex_get_waiters( +/*==============*/ + const ib_mutex_t* mutex); /*!< in: mutex */ +#endif /* UNIV_SYNC_DEBUG */ + +/* + LATCHING ORDER WITHIN THE DATABASE + ================================== + +The mutex or latch in the central memory object, for instance, a rollback +segment object, must be acquired before acquiring the latch or latches to +the corresponding file data structure. In the latching order below, these +file page object latches are placed immediately below the corresponding +central memory object latch or mutex. + +Synchronization object Notes +---------------------- ----- + +Dictionary mutex If we have a pointer to a dictionary +| object, e.g., a table, it can be +| accessed without reserving the +| dictionary mutex. We must have a +| reservation, a memoryfix, to the +| appropriate table object in this case, +| and the table must be explicitly +| released later. +V +Dictionary header +| +V +Secondary index tree latch The tree latch protects also all +| the B-tree non-leaf pages. These +V can be read with the page only +Secondary index non-leaf bufferfixed to save CPU time, +| no s-latch is needed on the page. +| Modification of a page requires an +| x-latch on the page, however. If a +| thread owns an x-latch to the tree, +| it is allowed to latch non-leaf pages +| even after it has acquired the fsp +| latch. +V +Secondary index leaf The latch on the secondary index leaf +| can be kept while accessing the +| clustered index, to save CPU time. +V +Clustered index tree latch To increase concurrency, the tree +| latch is usually released when the +| leaf page latch has been acquired. +V +Clustered index non-leaf +| +V +Clustered index leaf +| +V +Transaction system header +| +V +Transaction undo mutex The undo log entry must be written +| before any index page is modified. +| Transaction undo mutex is for the undo +| logs the analogue of the tree latch +| for a B-tree. If a thread has the +| trx undo mutex reserved, it is allowed +| to latch the undo log pages in any +| order, and also after it has acquired +| the fsp latch. +V +Rollback segment mutex The rollback segment mutex must be +| reserved, if, e.g., a new page must +| be added to an undo log. The rollback +| segment and the undo logs in its +| history list can be seen as an +| analogue of a B-tree, and the latches +| reserved similarly, using a version of +| lock-coupling. If an undo log must be +| extended by a page when inserting an +| undo log record, this corresponds to +| a pessimistic insert in a B-tree. +V +Rollback segment header +| +V +Purge system latch +| +V +Undo log pages If a thread owns the trx undo mutex, +| or for a log in the history list, the +| rseg mutex, it is allowed to latch +| undo log pages in any order, and even +| after it has acquired the fsp latch. +| If a thread does not have the +| appropriate mutex, it is allowed to +| latch only a single undo log page in +| a mini-transaction. +V +File space management latch If a mini-transaction must allocate +| several file pages, it can do that, +| because it keeps the x-latch to the +| file space management in its memo. +V +File system pages +| +V +lock_sys_wait_mutex Mutex protecting lock timeout data +| +V +lock_sys_mutex Mutex protecting lock_sys_t +| +V +trx_sys->mutex Mutex protecting trx_sys_t +| +V +Threads mutex Background thread scheduling mutex +| +V +query_thr_mutex Mutex protecting query threads +| +V +trx_mutex Mutex protecting trx_t fields +| +V +Search system mutex +| +V +Buffer pool mutex +| +V +Log mutex +| +Any other latch +| +V +Memory pool mutex */ + +/* Latching order levels. If you modify these, you have to also update +sync_thread_add_level(). */ + +/* User transaction locks are higher than any of the latch levels below: +no latches are allowed when a thread goes to wait for a normal table +or row lock! */ +#define SYNC_USER_TRX_LOCK 9999 +#define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress + latching order checking */ +#define SYNC_LEVEL_VARYING 2000 /* Level is varying. Only used with + buffer pool page locks, which do not + have a fixed level, but instead have + their level set after the page is + locked; see e.g. + ibuf_bitmap_get_map_page(). */ +#define SYNC_TRX_I_S_RWLOCK 1910 /* Used for + trx_i_s_cache_t::rw_lock */ +#define SYNC_TRX_I_S_LAST_READ 1900 /* Used for + trx_i_s_cache_t::last_read_mutex */ +#define SYNC_FILE_FORMAT_TAG 1200 /* Used to serialize access to the + file format tag */ +#define SYNC_DICT_OPERATION 1010 /* table create, drop, etc. reserve + this in X-mode; implicit or backround + operations purge, rollback, foreign + key checks reserve this in S-mode */ +#define SYNC_FTS_CACHE 1005 /* FTS cache rwlock */ +#define SYNC_DICT 1000 +#define SYNC_DICT_AUTOINC_MUTEX 999 +#define SYNC_STATS_AUTO_RECALC 997 +#define SYNC_DICT_HEADER 995 +#define SYNC_IBUF_HEADER 914 +#define SYNC_IBUF_PESS_INSERT_MUTEX 912 +/*-------------------------------*/ +#define SYNC_INDEX_TREE 900 +#define SYNC_TREE_NODE_NEW 892 +#define SYNC_TREE_NODE_FROM_HASH 891 +#define SYNC_TREE_NODE 890 +#define SYNC_PURGE_LATCH 800 +#define SYNC_TRX_UNDO 700 +#define SYNC_RSEG 600 +#define SYNC_RSEG_HEADER_NEW 591 +#define SYNC_RSEG_HEADER 590 +#define SYNC_TRX_UNDO_PAGE 570 +#define SYNC_EXTERN_STORAGE 500 +#define SYNC_FSP 400 +#define SYNC_FSP_PAGE 395 +/*------------------------------------- Change buffer headers */ +#define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */ +/*------------------------------------- Change buffer tree */ +#define SYNC_IBUF_INDEX_TREE 360 +#define SYNC_IBUF_TREE_NODE_NEW 359 +#define SYNC_IBUF_TREE_NODE 358 +#define SYNC_IBUF_BITMAP_MUTEX 351 +#define SYNC_IBUF_BITMAP 350 +/*------------------------------------- Change log for online create index */ +#define SYNC_INDEX_ONLINE_LOG 340 +/*------------------------------------- MySQL query cache mutex */ +/*------------------------------------- MySQL binlog mutex */ +/*-------------------------------*/ +#define SYNC_LOCK_WAIT_SYS 300 +#define SYNC_LOCK_SYS 299 +#define SYNC_TRX_SYS 298 +#define SYNC_TRX 297 +#define SYNC_THREADS 295 +#define SYNC_REC_LOCK 294 +#define SYNC_TRX_SYS_HEADER 290 +#define SYNC_PURGE_QUEUE 200 +#define SYNC_LOG 170 +#define SYNC_LOG_FLUSH_ORDER 147 +#define SYNC_RECV 168 +#define SYNC_FTS_TOKENIZE 167 +#define SYNC_FTS_CACHE_INIT 166 /* Used for FTS cache initialization */ +#define SYNC_FTS_BG_THREADS 165 +#define SYNC_FTS_OPTIMIZE 164 // FIXME: is this correct number, test +#define SYNC_WORK_QUEUE 162 +#define SYNC_SEARCH_SYS 160 /* NOTE that if we have a memory + heap that can be extended to the + buffer pool, its logical level is + SYNC_SEARCH_SYS, as memory allocation + can call routines there! Otherwise + the level is SYNC_MEM_HASH. */ +#define SYNC_BUF_POOL 150 /* Buffer pool mutex */ +#define SYNC_BUF_PAGE_HASH 149 /* buf_pool->page_hash rw_lock */ +#define SYNC_BUF_BLOCK 146 /* Block mutex */ +#define SYNC_BUF_FLUSH_LIST 145 /* Buffer flush list mutex */ +#define SYNC_DOUBLEWRITE 140 +#define SYNC_ANY_LATCH 135 +#define SYNC_MEM_HASH 131 +#define SYNC_MEM_POOL 130 + +/* Codes used to designate lock operations */ +#define RW_LOCK_NOT_LOCKED 350 +#define RW_LOCK_EX 351 +#define RW_LOCK_EXCLUSIVE 351 +#define RW_LOCK_SHARED 352 +#define RW_LOCK_WAIT_EX 353 +#define SYNC_MUTEX 354 + +/* NOTE! The structure appears here only for the compiler to know its size. +Do not use its fields directly! The structure used in the spin lock +implementation of a mutual exclusion semaphore. */ + +/** InnoDB mutex */ +struct ib_mutex_t { + os_event_t event; /*!< Used by sync0arr.cc for the wait queue */ + volatile lock_word_t lock_word; /*!< lock_word is the target + of the atomic test-and-set instruction when + atomic operations are enabled. */ + +#if !defined(HAVE_ATOMIC_BUILTINS) + os_fast_mutex_t + os_fast_mutex; /*!< We use this OS mutex in place of lock_word + when atomic operations are not enabled */ +#endif + ulint waiters; /*!< This ulint is set to 1 if there are (or + may be) threads waiting in the global wait + array for this mutex to be released. + Otherwise, this is 0. */ + UT_LIST_NODE_T(ib_mutex_t) list; /*!< All allocated mutexes are put into + a list. Pointers to the next and prev. */ +#ifdef UNIV_SYNC_DEBUG + const char* file_name; /*!< File where the mutex was locked */ + ulint line; /*!< Line where the mutex was locked */ + ulint level; /*!< Level in the global latching order */ +#endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name;/*!< File name where mutex created */ + ulint cline; /*!< Line where created */ + ulong count_os_wait; /*!< count of os_wait */ +#ifdef UNIV_DEBUG + +/** Value of mutex_t::magic_n */ +# define MUTEX_MAGIC_N 979585UL + + os_thread_id_t thread_id; /*!< The thread id of the thread + which locked the mutex. */ + ulint magic_n; /*!< MUTEX_MAGIC_N */ + const char* cmutex_name; /*!< mutex name */ + ulint ib_mutex_type; /*!< 0=usual mutex, 1=rw_lock mutex */ +#endif /* UNIV_DEBUG */ +#ifdef UNIV_PFS_MUTEX + struct PSI_mutex* pfs_psi; /*!< The performance schema + instrumentation hook */ +#endif +}; + +/** Constant determining how long spin wait is continued before suspending +the thread. A value 600 rounds on a 1995 100 MHz Pentium seems to correspond +to 20 microseconds. */ + +#define SYNC_SPIN_ROUNDS srv_n_spin_wait_rounds + +/** The number of mutex_exit calls. Intended for performance monitoring. */ +extern ib_int64_t mutex_exit_count; + +#ifdef UNIV_SYNC_DEBUG +/** Latching order checks start when this is set TRUE */ +extern ibool sync_order_checks_on; +#endif /* UNIV_SYNC_DEBUG */ + +/** This variable is set to TRUE when sync_init is called */ +extern ibool sync_initialized; + +/** Global list of database mutexes (not OS mutexes) created. */ +typedef UT_LIST_BASE_NODE_T(ib_mutex_t) ut_list_base_node_t; +/** Global list of database mutexes (not OS mutexes) created. */ +extern ut_list_base_node_t mutex_list; + +/** Mutex protecting the mutex_list variable */ +extern ib_mutex_t mutex_list_mutex; + +#ifndef HAVE_ATOMIC_BUILTINS +/**********************************************************//** +Function that uses a mutex to decrement a variable atomically */ +UNIV_INLINE +void +os_atomic_dec_ulint_func( +/*=====================*/ + ib_mutex_t* mutex, /*!< in: mutex guarding the + decrement */ + volatile ulint* var, /*!< in/out: variable to + decrement */ + ulint delta); /*!< in: delta to decrement */ +/**********************************************************//** +Function that uses a mutex to increment a variable atomically */ +UNIV_INLINE +void +os_atomic_inc_ulint_func( +/*=====================*/ + ib_mutex_t* mutex, /*!< in: mutex guarding the + increment */ + volatile ulint* var, /*!< in/out: variable to + increment */ + ulint delta); /*!< in: delta to increment */ +#endif /* !HAVE_ATOMIC_BUILTINS */ + +#ifndef UNIV_NONINL +#include "sync0sync.ic" +#endif + +#endif diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic new file mode 100644 index 00000000000..616e53d4aac --- /dev/null +++ b/storage/innobase/include/sync0sync.ic @@ -0,0 +1,414 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0sync.ic +Mutex, the basic synchronization primitive + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +/******************************************************************//** +Sets the waiters field in a mutex. */ +UNIV_INTERN +void +mutex_set_waiters( +/*==============*/ + ib_mutex_t* mutex, /*!< in: mutex */ + ulint n); /*!< in: value to set */ +/******************************************************************//** +Reserves a mutex for the current thread. If the mutex is reserved, the +function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting +for the mutex before suspending the thread. */ +UNIV_INTERN +void +mutex_spin_wait( +/*============*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where mutex + requested */ + ulint line); /*!< in: line where requested */ +#ifdef UNIV_SYNC_DEBUG +/******************************************************************//** +Sets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_set_debug_info( +/*=================*/ + ib_mutex_t* mutex, /*!< in: mutex */ + const char* file_name, /*!< in: file where requested */ + ulint line); /*!< in: line where requested */ +#endif /* UNIV_SYNC_DEBUG */ +/******************************************************************//** +Releases the threads waiting in the primary wait array for this mutex. */ +UNIV_INTERN +void +mutex_signal_object( +/*================*/ + ib_mutex_t* mutex); /*!< in: mutex */ + +/******************************************************************//** +Performs an atomic test-and-set instruction to the lock_word field of a +mutex. +@return the previous value of lock_word: 0 or 1 */ +UNIV_INLINE +byte +ib_mutex_test_and_set( +/*===============*/ + ib_mutex_t* mutex) /*!< in: mutex */ +{ +#if defined(HAVE_ATOMIC_BUILTINS) +# if defined(HAVE_ATOMIC_BUILTINS_BYTE) + return(os_atomic_test_and_set_byte(&mutex->lock_word, 1)); +# else + return(os_atomic_test_and_set_ulint(&mutex->lock_word, 1)); +# endif +#else + ibool ret; + + ret = os_fast_mutex_trylock(&(mutex->os_fast_mutex)); + + if (ret == 0) { + /* We check that os_fast_mutex_trylock does not leak + and allow race conditions */ + ut_a(mutex->lock_word == 0); + + mutex->lock_word = 1; + os_wmb; + } + + return((byte) ret); +#endif +} + +/******************************************************************//** +Performs a reset instruction to the lock_word field of a mutex. This +instruction also serializes memory operations to the program order. */ +UNIV_INLINE +void +mutex_reset_lock_word( +/*==================*/ + ib_mutex_t* mutex) /*!< in: mutex */ +{ +#if defined(HAVE_ATOMIC_BUILTINS) + /* In theory __sync_lock_release should be used to release the lock. + Unfortunately, it does not work properly alone. The workaround is + that more conservative __sync_lock_test_and_set is used instead. */ +# if defined(HAVE_ATOMIC_BUILTINS_BYTE) + os_atomic_test_and_set_byte(&mutex->lock_word, 0); +# else + os_atomic_test_and_set_ulint(&mutex->lock_word, 0); +# endif +#else + mutex->lock_word = 0; + + os_fast_mutex_unlock(&(mutex->os_fast_mutex)); +#endif +} + +/******************************************************************//** +Gets the value of the lock word. */ +UNIV_INLINE +lock_word_t +mutex_get_lock_word( +/*================*/ + const ib_mutex_t* mutex) /*!< in: mutex */ +{ + ut_ad(mutex); + + return(mutex->lock_word); +} + +/******************************************************************//** +Gets the waiters field in a mutex. +@return value to set */ +UNIV_INLINE +ulint +mutex_get_waiters( +/*==============*/ + const ib_mutex_t* mutex) /*!< in: mutex */ +{ + const volatile ulint* ptr; /*!< declared volatile to ensure that + the value is read from memory */ + ut_ad(mutex); + + ptr = &(mutex->waiters); + + return(*ptr); /* Here we assume that the read of a single + word from memory is atomic */ +} + +/******************************************************************//** +NOTE! Use the corresponding macro mutex_exit(), not directly this function! +Unlocks a mutex owned by the current thread. */ +UNIV_INLINE +void +mutex_exit_func( +/*============*/ + ib_mutex_t* mutex) /*!< in: pointer to mutex */ +{ + ut_ad(mutex_own(mutex)); + + ut_d(mutex->thread_id = (os_thread_id_t) ULINT_UNDEFINED); + +#ifdef UNIV_SYNC_DEBUG + sync_thread_reset_level(mutex); +#endif + mutex_reset_lock_word(mutex); + + /* A problem: we assume that mutex_reset_lock word + is a memory barrier, that is when we read the waiters + field next, the read must be serialized in memory + after the reset. A speculative processor might + perform the read first, which could leave a waiting + thread hanging indefinitely. + + Our current solution call every second + sync_arr_wake_threads_if_sema_free() + to wake up possible hanging threads if + they are missed in mutex_signal_object. */ + + if (mutex_get_waiters(mutex) != 0) { + + mutex_signal_object(mutex); + } + +#ifdef UNIV_SYNC_PERF_STAT + mutex_exit_count++; +#endif +} + +/******************************************************************//** +Locks a mutex for the current thread. If the mutex is reserved, the function +spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for the mutex +before suspending the thread. */ +UNIV_INLINE +void +mutex_enter_func( +/*=============*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where locked */ + ulint line) /*!< in: line where locked */ +{ + ut_ad(mutex_validate(mutex)); + ut_ad(!mutex_own(mutex)); + + /* Note that we do not peek at the value of lock_word before trying + the atomic test_and_set; we could peek, and possibly save time. */ + + if (!ib_mutex_test_and_set(mutex)) { + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + return; /* Succeeded! */ + } + + mutex_spin_wait(mutex, file_name, line); +} + +#ifdef UNIV_PFS_MUTEX +/******************************************************************//** +NOTE! Please use the corresponding macro mutex_enter(), not directly +this function! +This is a performance schema instrumented wrapper function for +mutex_enter_func(). */ +UNIV_INLINE +void +pfs_mutex_enter_func( +/*=================*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where locked */ + ulint line) /*!< in: line where locked */ +{ + if (mutex->pfs_psi != NULL) { + PSI_mutex_locker* locker; + PSI_mutex_locker_state state; + + locker = PSI_MUTEX_CALL(start_mutex_wait)( + &state, mutex->pfs_psi, + PSI_MUTEX_LOCK, file_name, + static_cast<uint>(line)); + + mutex_enter_func(mutex, file_name, line); + + if (locker != NULL) { + PSI_MUTEX_CALL(end_mutex_wait)(locker, 0); + } + } else { + mutex_enter_func(mutex, file_name, line); + } +} + +/********************************************************************//** +NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly +this function! +This is a performance schema instrumented wrapper function for +mutex_enter_nowait_func. +@return 0 if succeed, 1 if not */ +UNIV_INLINE +ulint +pfs_mutex_enter_nowait_func( +/*========================*/ + ib_mutex_t* mutex, /*!< in: pointer to mutex */ + const char* file_name, /*!< in: file name where mutex + requested */ + ulint line) /*!< in: line where requested */ +{ + ulint ret; + + if (mutex->pfs_psi != NULL) { + PSI_mutex_locker* locker; + PSI_mutex_locker_state state; + + locker = PSI_MUTEX_CALL(start_mutex_wait)( + &state, mutex->pfs_psi, + PSI_MUTEX_TRYLOCK, file_name, + static_cast<uint>(line)); + + ret = mutex_enter_nowait_func(mutex, file_name, line); + + if (locker != NULL) { + PSI_MUTEX_CALL(end_mutex_wait)(locker, (int) ret); + } + } else { + ret = mutex_enter_nowait_func(mutex, file_name, line); + } + + return(ret); +} + +/******************************************************************//** +NOTE! Please use the corresponding macro mutex_exit(), not directly +this function! +A wrap function of mutex_exit_func() with performance schema instrumentation. +Unlocks a mutex owned by the current thread. */ +UNIV_INLINE +void +pfs_mutex_exit_func( +/*================*/ + ib_mutex_t* mutex) /*!< in: pointer to mutex */ +{ + if (mutex->pfs_psi != NULL) { + PSI_MUTEX_CALL(unlock_mutex)(mutex->pfs_psi); + } + + mutex_exit_func(mutex); +} + +/******************************************************************//** +NOTE! Please use the corresponding macro mutex_create(), not directly +this function! +A wrapper function for mutex_create_func(), registers the mutex +with performance schema if "UNIV_PFS_MUTEX" is defined when +creating the mutex */ +UNIV_INLINE +void +pfs_mutex_create_func( +/*==================*/ + mysql_pfs_key_t key, /*!< in: Performance Schema key */ + ib_mutex_t* mutex, /*!< in: pointer to memory */ +# ifdef UNIV_DEBUG + const char* cmutex_name, /*!< in: mutex name */ +# ifdef UNIV_SYNC_DEBUG + ulint level, /*!< in: level */ +# endif /* UNIV_SYNC_DEBUG */ +# endif /* UNIV_DEBUG */ + const char* cfile_name, /*!< in: file name where created */ + ulint cline) /*!< in: file line where created */ +{ + mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, mutex); + + mutex_create_func(mutex, +# ifdef UNIV_DEBUG + cmutex_name, +# ifdef UNIV_SYNC_DEBUG + level, +# endif /* UNIV_SYNC_DEBUG */ +# endif /* UNIV_DEBUG */ + cfile_name, + cline); +} + +/******************************************************************//** +NOTE! Please use the corresponding macro mutex_free(), not directly +this function! +Wrapper function for mutex_free_func(). Also destroys the performance +schema probes when freeing the mutex */ +UNIV_INLINE +void +pfs_mutex_free_func( +/*================*/ + ib_mutex_t* mutex) /*!< in: mutex */ +{ + if (mutex->pfs_psi != NULL) { + PSI_MUTEX_CALL(destroy_mutex)(mutex->pfs_psi); + mutex->pfs_psi = NULL; + } + + mutex_free_func(mutex); +} + +#endif /* UNIV_PFS_MUTEX */ + +#ifndef HAVE_ATOMIC_BUILTINS +/**********************************************************//** +Function that uses a mutex to decrement a variable atomically */ +UNIV_INLINE +void +os_atomic_dec_ulint_func( +/*=====================*/ + ib_mutex_t* mutex, /*!< in: mutex guarding the dec */ + volatile ulint* var, /*!< in/out: variable to decrement */ + ulint delta) /*!< in: delta to decrement */ +{ + mutex_enter(mutex); + + /* I don't think we will encounter a situation where + this check will not be required. */ + ut_ad(*var >= delta); + + *var -= delta; + + mutex_exit(mutex); +} + +/**********************************************************//** +Function that uses a mutex to increment a variable atomically */ +UNIV_INLINE +void +os_atomic_inc_ulint_func( +/*=====================*/ + ib_mutex_t* mutex, /*!< in: mutex guarding the increment */ + volatile ulint* var, /*!< in/out: variable to increment */ + ulint delta) /*!< in: delta to increment */ +{ + mutex_enter(mutex); + + *var += delta; + + mutex_exit(mutex); +} +#endif /* !HAVE_ATOMIC_BUILTINS */ diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h new file mode 100644 index 00000000000..0d143004a7a --- /dev/null +++ b/storage/innobase/include/sync0types.h @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/sync0types.h +Global types for sync + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0types_h +#define sync0types_h + +struct ib_mutex_t; + +#endif diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h new file mode 100644 index 00000000000..662971a7841 --- /dev/null +++ b/storage/innobase/include/trx0i_s.h @@ -0,0 +1,311 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0i_s.h +INFORMATION SCHEMA innodb_trx, innodb_locks and +innodb_lock_waits tables cache structures and public +functions. + +Created July 17, 2007 Vasil Dimov +*******************************************************/ + +#ifndef trx0i_s_h +#define trx0i_s_h + +#include "univ.i" +#include "trx0types.h" +#include "dict0types.h" +#include "ut0ut.h" + +/** The maximum amount of memory that can be consumed by innodb_trx, +innodb_locks and innodb_lock_waits information schema tables. */ +#define TRX_I_S_MEM_LIMIT 16777216 /* 16 MiB */ + +/** The maximum length of a string that can be stored in +i_s_locks_row_t::lock_data */ +#define TRX_I_S_LOCK_DATA_MAX_LEN 8192 + +/** The maximum length of a string that can be stored in +i_s_trx_row_t::trx_query */ +#define TRX_I_S_TRX_QUERY_MAX_LEN 1024 + +/** The maximum length of a string that can be stored in +i_s_trx_row_t::trx_operation_state */ +#define TRX_I_S_TRX_OP_STATE_MAX_LEN 64 + +/** The maximum length of a string that can be stored in +i_s_trx_row_t::trx_foreign_key_error */ +#define TRX_I_S_TRX_FK_ERROR_MAX_LEN 256 + +/** The maximum length of a string that can be stored in +i_s_trx_row_t::trx_isolation_level */ +#define TRX_I_S_TRX_ISOLATION_LEVEL_MAX_LEN 16 + +/** Safely copy strings in to the INNODB_TRX table's +string based columns */ +#define TRX_I_S_STRING_COPY(data, field, constraint, tcache) \ +do { \ + if (strlen(data) > constraint) { \ + char buff[constraint + 1]; \ + strncpy(buff, data, constraint); \ + buff[constraint] = '\0'; \ + \ + field = static_cast<const char*>( \ + ha_storage_put_memlim( \ + (tcache)->storage, buff, constraint + 1,\ + MAX_ALLOWED_FOR_STORAGE(tcache))); \ + } else { \ + field = static_cast<const char*>( \ + ha_storage_put_str_memlim( \ + (tcache)->storage, data, \ + MAX_ALLOWED_FOR_STORAGE(tcache))); \ + } \ +} while (0) + +/** A row of INFORMATION_SCHEMA.innodb_locks */ +struct i_s_locks_row_t; + +/** Objects of trx_i_s_cache_t::locks_hash */ +struct i_s_hash_chain_t; + +/** Objects of this type are added to the hash table +trx_i_s_cache_t::locks_hash */ +struct i_s_hash_chain_t { + i_s_locks_row_t* value; /*!< row of + INFORMATION_SCHEMA.innodb_locks*/ + i_s_hash_chain_t* next; /*!< next item in the hash chain */ +}; + +/** This structure represents INFORMATION_SCHEMA.innodb_locks row */ +struct i_s_locks_row_t { + trx_id_t lock_trx_id; /*!< transaction identifier */ + const char* lock_mode; /*!< lock mode from + lock_get_mode_str() */ + const char* lock_type; /*!< lock type from + lock_get_type_str() */ + const char* lock_table; /*!< table name from + lock_get_table_name() */ + const char* lock_index; /*!< index name from + lock_rec_get_index_name() */ + /** Information for record locks. All these are + ULINT_UNDEFINED for table locks. */ + /* @{ */ + ulint lock_space; /*!< tablespace identifier */ + ulint lock_page; /*!< page number within the_space */ + ulint lock_rec; /*!< heap number of the record + on the page */ + const char* lock_data; /*!< (some) content of the record */ + /* @} */ + + /** The following are auxiliary and not included in the table */ + /* @{ */ + table_id_t lock_table_id; + /*!< table identifier from + lock_get_table_id */ + i_s_hash_chain_t hash_chain; /*!< hash table chain node for + trx_i_s_cache_t::locks_hash */ + /* @} */ +}; + +/** This structure represents INFORMATION_SCHEMA.innodb_trx row */ +struct i_s_trx_row_t { + trx_id_t trx_id; /*!< transaction identifier */ + const char* trx_state; /*!< transaction state from + trx_get_que_state_str() */ + ib_time_t trx_started; /*!< trx_t::start_time */ + const i_s_locks_row_t* requested_lock_row; + /*!< pointer to a row + in innodb_locks if trx + is waiting, or NULL */ + ib_time_t trx_wait_started; /*!< trx_t::wait_started */ + ullint trx_weight; /*!< TRX_WEIGHT() */ + ulint trx_mysql_thread_id; /*!< thd_get_thread_id() */ + const char* trx_query; /*!< MySQL statement being + executed in the transaction */ + struct charset_info_st* trx_query_cs; + /*!< charset encode the MySQL + statement */ + const char* trx_operation_state; /*!< trx_t::op_info */ + ulint trx_tables_in_use;/*!< n_mysql_tables_in_use in + trx_t */ + ulint trx_tables_locked; + /*!< mysql_n_tables_locked in + trx_t */ + ulint trx_lock_structs;/*!< list len of trx_locks in + trx_t */ + ulint trx_lock_memory_bytes; + /*!< mem_heap_get_size( + trx->lock_heap) */ + ulint trx_rows_locked;/*!< lock_number_of_rows_locked() */ + ullint trx_rows_modified;/*!< trx_t::undo_no */ + ulint trx_concurrency_tickets; + /*!< n_tickets_to_enter_innodb in + trx_t */ + const char* trx_isolation_level; + /*!< isolation_level in trx_t */ + ibool trx_unique_checks; + /*!< check_unique_secondary in trx_t*/ + ibool trx_foreign_key_checks; + /*!< check_foreigns in trx_t */ + const char* trx_foreign_key_error; + /*!< detailed_error in trx_t */ + ibool trx_has_search_latch; + /*!< has_search_latch in trx_t */ + ulint trx_search_latch_timeout; + /*!< search_latch_timeout in trx_t */ + ulint trx_is_read_only; + /*!< trx_t::read_only */ + ulint trx_is_autocommit_non_locking; + /*!< trx_is_autocommit_non_locking(trx) + */ +}; + +/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */ +struct i_s_lock_waits_row_t { + const i_s_locks_row_t* requested_lock_row; /*!< requested lock */ + const i_s_locks_row_t* blocking_lock_row; /*!< blocking lock */ +}; + +/** Cache of INFORMATION_SCHEMA table data */ +struct trx_i_s_cache_t; + +/** Auxiliary enum used by functions that need to select one of the +INFORMATION_SCHEMA tables */ +enum i_s_table { + I_S_INNODB_TRX, /*!< INFORMATION_SCHEMA.innodb_trx */ + I_S_INNODB_LOCKS, /*!< INFORMATION_SCHEMA.innodb_locks */ + I_S_INNODB_LOCK_WAITS /*!< INFORMATION_SCHEMA.innodb_lock_waits */ +}; + +/** This is the intermediate buffer where data needed to fill the +INFORMATION SCHEMA tables is fetched and later retrieved by the C++ +code in handler/i_s.cc. */ +extern trx_i_s_cache_t* trx_i_s_cache; + +/*******************************************************************//** +Initialize INFORMATION SCHEMA trx related cache. */ +UNIV_INTERN +void +trx_i_s_cache_init( +/*===============*/ + trx_i_s_cache_t* cache); /*!< out: cache to init */ +/*******************************************************************//** +Free the INFORMATION SCHEMA trx related cache. */ +UNIV_INTERN +void +trx_i_s_cache_free( +/*===============*/ + trx_i_s_cache_t* cache); /*!< in/out: cache to free */ + +/*******************************************************************//** +Issue a shared/read lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_start_read( +/*=====================*/ + trx_i_s_cache_t* cache); /*!< in: cache */ + +/*******************************************************************//** +Release a shared/read lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_end_read( +/*===================*/ + trx_i_s_cache_t* cache); /*!< in: cache */ + +/*******************************************************************//** +Issue an exclusive/write lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_start_write( +/*======================*/ + trx_i_s_cache_t* cache); /*!< in: cache */ + +/*******************************************************************//** +Release an exclusive/write lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_end_write( +/*====================*/ + trx_i_s_cache_t* cache); /*!< in: cache */ + + +/*******************************************************************//** +Retrieves the number of used rows in the cache for a given +INFORMATION SCHEMA table. +@return number of rows */ +UNIV_INTERN +ulint +trx_i_s_cache_get_rows_used( +/*========================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + enum i_s_table table); /*!< in: which table */ + +/*******************************************************************//** +Retrieves the nth row in the cache for a given INFORMATION SCHEMA +table. +@return row */ +UNIV_INTERN +void* +trx_i_s_cache_get_nth_row( +/*======================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + enum i_s_table table, /*!< in: which table */ + ulint n); /*!< in: row number */ + +/*******************************************************************//** +Update the transactions cache if it has not been read for some time. +@return 0 - fetched, 1 - not */ +UNIV_INTERN +int +trx_i_s_possibly_fetch_data_into_cache( +/*===================================*/ + trx_i_s_cache_t* cache); /*!< in/out: cache */ + +/*******************************************************************//** +Returns TRUE if the data in the cache is truncated due to the memory +limit posed by TRX_I_S_MEM_LIMIT. +@return TRUE if truncated */ +UNIV_INTERN +ibool +trx_i_s_cache_is_truncated( +/*=======================*/ + trx_i_s_cache_t* cache); /*!< in: cache */ + +/** The maximum length of a resulting lock_id_size in +trx_i_s_create_lock_id(), not including the terminating NUL. +":%lu:%lu:%lu" -> 63 chars */ +#define TRX_I_S_LOCK_ID_MAX_LEN (TRX_ID_MAX_LEN + 63) + +/*******************************************************************//** +Crafts a lock id string from a i_s_locks_row_t object. Returns its +second argument. This function aborts if there is not enough space in +lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you +want to be 100% sure that it will not abort. +@return resulting lock id */ +UNIV_INTERN +char* +trx_i_s_create_lock_id( +/*===================*/ + const i_s_locks_row_t* row, /*!< in: innodb_locks row */ + char* lock_id,/*!< out: resulting lock_id */ + ulint lock_id_size);/*!< in: size of the lock id + buffer */ + +#endif /* trx0i_s_h */ diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h new file mode 100644 index 00000000000..1e13c883800 --- /dev/null +++ b/storage/innobase/include/trx0purge.h @@ -0,0 +1,218 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0purge.h +Purge old versions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0purge_h +#define trx0purge_h + +#include "univ.i" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "trx0sys.h" +#include "que0types.h" +#include "page0page.h" +#include "usr0sess.h" +#include "fil0fil.h" + +/** The global data structure coordinating a purge */ +extern trx_purge_t* purge_sys; + +/** A dummy undo record used as a return value when we have a whole undo log +which needs no purge */ +extern trx_undo_rec_t trx_purge_dummy_rec; + +/********************************************************************//** +Calculates the file address of an undo log header when we have the file +address of its history list node. +@return file address of the log */ +UNIV_INLINE +fil_addr_t +trx_purge_get_log_from_hist( +/*========================*/ + fil_addr_t node_addr); /*!< in: file address of the history + list node of the log */ +/********************************************************************//** +Creates the global purge system control structure and inits the history +mutex. */ +UNIV_INTERN +void +trx_purge_sys_create( +/*=================*/ + ulint n_purge_threads,/*!< in: number of purge threads */ + ib_bh_t* ib_bh); /*!< in/own: UNDO log min binary heap*/ +/********************************************************************//** +Frees the global purge system control structure. */ +UNIV_INTERN +void +trx_purge_sys_close(void); +/*======================*/ +/************************************************************************ +Adds the update undo log as the first log in the history list. Removes the +update undo log segment from the rseg slot if it is too big for reuse. */ +UNIV_INTERN +void +trx_purge_add_update_undo_to_history( +/*=================================*/ + trx_t* trx, /*!< in: transaction */ + page_t* undo_page, /*!< in: update undo log header page, + x-latched */ + mtr_t* mtr); /*!< in: mtr */ +/*******************************************************************//** +This function runs a purge batch. +@return number of undo log pages handled in the batch */ +UNIV_INTERN +ulint +trx_purge( +/*======*/ + ulint n_purge_threads, /*!< in: number of purge tasks to + submit to task queue. */ + ulint limit, /*!< in: the maximum number of + records to purge in one batch */ + bool truncate); /*!< in: truncate history if true */ +/*******************************************************************//** +Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */ +UNIV_INTERN +void +trx_purge_stop(void); +/*================*/ +/*******************************************************************//** +Resume purge, move to PURGE_STATE_RUN. */ +UNIV_INTERN +void +trx_purge_run(void); +/*================*/ + +/** Purge states */ +enum purge_state_t { + PURGE_STATE_INIT, /*!< Purge instance created */ + PURGE_STATE_RUN, /*!< Purge should be running */ + PURGE_STATE_STOP, /*!< Purge should be stopped */ + PURGE_STATE_EXIT, /*!< Purge has been shutdown */ + PURGE_STATE_DISABLED /*!< Purge was never started */ +}; + +/*******************************************************************//** +Get the purge state. +@return purge state. */ +UNIV_INTERN +purge_state_t +trx_purge_state(void); +/*=================*/ + +/** This is the purge pointer/iterator. We need both the undo no and the +transaction no up to which purge has parsed and applied the records. */ +struct purge_iter_t { + trx_id_t trx_no; /*!< Purge has advanced past all + transactions whose number is less + than this */ + undo_no_t undo_no; /*!< Purge has advanced past all records + whose undo number is less than this */ +}; + +/** The control structure used in the purge operation */ +struct trx_purge_t{ + sess_t* sess; /*!< System session running the purge + query */ + trx_t* trx; /*!< System transaction running the + purge query: this trx is not in the + trx list of the trx system and it + never ends */ + rw_lock_t latch; /*!< The latch protecting the purge + view. A purge operation must acquire an + x-latch here for the instant at which + it changes the purge view: an undo + log operation can prevent this by + obtaining an s-latch here. It also + protects state and running */ + os_event_t event; /*!< State signal event */ + ulint n_stop; /*!< Counter to track number stops */ + volatile bool running; /*!< true, if purge is active, + we check this without the latch too */ + volatile purge_state_t state; /*!< Purge coordinator thread states, + we check this in several places + without holding the latch. */ + que_t* query; /*!< The query graph which will do the + parallelized purge operation */ + read_view_t* view; /*!< The purge will not remove undo logs + which are >= this view (purge view) */ + volatile ulint n_submitted; /*!< Count of total tasks submitted + to the task queue */ + volatile ulint n_completed; /*!< Count of total tasks completed */ + + /*------------------------------*/ + /* The following two fields form the 'purge pointer' which advances + during a purge, and which is used in history list truncation */ + + purge_iter_t iter; /* Limit up to which we have read and + parsed the UNDO log records. Not + necessarily purged from the indexes. + Note that this can never be less than + the limit below, we check for this + invariant in trx0purge.cc */ + purge_iter_t limit; /* The 'purge pointer' which advances + during a purge, and which is used in + history list truncation */ +#ifdef UNIV_DEBUG + purge_iter_t done; /* Indicate 'purge pointer' which have + purged already accurately. */ +#endif /* UNIV_DEBUG */ + /*-----------------------------*/ + ibool next_stored; /*!< TRUE if the info of the next record + to purge is stored below: if yes, then + the transaction number and the undo + number of the record are stored in + purge_trx_no and purge_undo_no above */ + trx_rseg_t* rseg; /*!< Rollback segment for the next undo + record to purge */ + ulint page_no; /*!< Page number for the next undo + record to purge, page number of the + log header, if dummy record */ + ulint offset; /*!< Page offset for the next undo + record to purge, 0 if the dummy + record */ + ulint hdr_page_no; /*!< Header page of the undo log where + the next record to purge belongs */ + ulint hdr_offset; /*!< Header byte offset on the page */ + /*-----------------------------*/ + mem_heap_t* heap; /*!< Temporary storage used during a + purge: can be emptied after purge + completes */ + /*-----------------------------*/ + ib_bh_t* ib_bh; /*!< Binary min-heap, ordered on + rseg_queue_t::trx_no. It is protected + by the bh_mutex */ + ib_mutex_t bh_mutex; /*!< Mutex protecting ib_bh */ +}; + +/** Info required to purge a record */ +struct trx_purge_rec_t { + trx_undo_rec_t* undo_rec; /*!< Record to purge */ + roll_ptr_t roll_ptr; /*!< File pointr to UNDO record */ +}; + +#ifndef UNIV_NONINL +#include "trx0purge.ic" +#endif + +#endif diff --git a/storage/innobase/include/trx0purge.ic b/storage/innobase/include/trx0purge.ic new file mode 100644 index 00000000000..ca9cc1fb894 --- /dev/null +++ b/storage/innobase/include/trx0purge.ic @@ -0,0 +1,62 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0purge.ic +Purge old versions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0undo.h" + +/********************************************************************//** +Calculates the file address of an undo log header when we have the file +address of its history list node. +@return file address of the log */ +UNIV_INLINE +fil_addr_t +trx_purge_get_log_from_hist( +/*========================*/ + fil_addr_t node_addr) /*!< in: file address of the history + list node of the log */ +{ + node_addr.boffset -= TRX_UNDO_HISTORY_NODE; + + return(node_addr); +} + +#ifdef UNIV_DEBUG +/********************************************************************//** +address of its history list node. +@return TRUE if purge_sys_t::limit <= purge_sys_t::iter*/ +UNIV_INLINE +ibool +trx_purge_check_limit(void) +/*=======================*/ +{ + ut_ad(purge_sys->limit.trx_no <= purge_sys->iter.trx_no); + + if (purge_sys->limit.trx_no == purge_sys->iter.trx_no) { + ut_ad(purge_sys->limit.undo_no <= purge_sys->iter.undo_no); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h new file mode 100644 index 00000000000..50da55d2ea3 --- /dev/null +++ b/storage/innobase/include/trx0rec.h @@ -0,0 +1,326 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rec.h +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0rec_h +#define trx0rec_h + +#include "univ.i" +#include "trx0types.h" +#include "row0types.h" +#include "mtr0mtr.h" +#include "dict0types.h" +#include "data0data.h" +#include "rem0types.h" + +#ifndef UNIV_HOTBACKUP +# include "que0types.h" + +/***********************************************************************//** +Copies the undo record to the heap. +@return own: copy of undo log record */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_rec_copy( +/*==============*/ + const trx_undo_rec_t* undo_rec, /*!< in: undo log record */ + mem_heap_t* heap); /*!< in: heap where copied */ +/**********************************************************************//** +Reads the undo log record type. +@return record type */ +UNIV_INLINE +ulint +trx_undo_rec_get_type( +/*==================*/ + const trx_undo_rec_t* undo_rec); /*!< in: undo log record */ +/**********************************************************************//** +Reads from an undo log record the record compiler info. +@return compiler info */ +UNIV_INLINE +ulint +trx_undo_rec_get_cmpl_info( +/*=======================*/ + const trx_undo_rec_t* undo_rec); /*!< in: undo log record */ +/**********************************************************************//** +Returns TRUE if an undo log record contains an extern storage field. +@return TRUE if extern */ +UNIV_INLINE +ibool +trx_undo_rec_get_extern_storage( +/*============================*/ + const trx_undo_rec_t* undo_rec); /*!< in: undo log record */ +/**********************************************************************//** +Reads the undo log record number. +@return undo no */ +UNIV_INLINE +undo_no_t +trx_undo_rec_get_undo_no( +/*=====================*/ + const trx_undo_rec_t* undo_rec); /*!< in: undo log record */ +/**********************************************************************//** +Returns the start of the undo record data area. +@return offset to the data area */ +UNIV_INLINE +ulint +trx_undo_rec_get_offset( +/*====================*/ + undo_no_t undo_no) /*!< in: undo no read from node */ + __attribute__((const)); + +/**********************************************************************//** +Returns the start of the undo record data area. */ +#define trx_undo_rec_get_ptr(undo_rec, undo_no) \ + ((undo_rec) + trx_undo_rec_get_offset(undo_no)) + +/**********************************************************************//** +Reads from an undo log record the general parameters. +@return remaining part of undo log record after reading these values */ +UNIV_INTERN +byte* +trx_undo_rec_get_pars( +/*==================*/ + trx_undo_rec_t* undo_rec, /*!< in: undo log record */ + ulint* type, /*!< out: undo record type: + TRX_UNDO_INSERT_REC, ... */ + ulint* cmpl_info, /*!< out: compiler info, relevant only + for update type records */ + bool* updated_extern, /*!< out: true if we updated an + externally stored fild */ + undo_no_t* undo_no, /*!< out: undo log record number */ + table_id_t* table_id) /*!< out: table id */ + __attribute__((nonnull)); +/*******************************************************************//** +Builds a row reference from an undo log record. +@return pointer to remaining part of undo record */ +UNIV_INTERN +byte* +trx_undo_rec_get_row_ref( +/*=====================*/ + byte* ptr, /*!< in: remaining part of a copy of an undo log + record, at the start of the row reference; + NOTE that this copy of the undo log record must + be preserved as long as the row reference is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t** ref, /*!< out, own: row reference */ + mem_heap_t* heap); /*!< in: memory heap from which the memory + needed is allocated */ +/*******************************************************************//** +Skips a row reference from an undo log record. +@return pointer to remaining part of undo record */ +UNIV_INTERN +byte* +trx_undo_rec_skip_row_ref( +/*======================*/ + byte* ptr, /*!< in: remaining part in update undo log + record, at the start of the row reference */ + dict_index_t* index); /*!< in: clustered index */ +/**********************************************************************//** +Reads from an undo log update record the system field values of the old +version. +@return remaining part of undo log record after reading these values */ +UNIV_INTERN +byte* +trx_undo_update_rec_get_sys_cols( +/*=============================*/ + byte* ptr, /*!< in: remaining part of undo + log record after reading + general parameters */ + trx_id_t* trx_id, /*!< out: trx id */ + roll_ptr_t* roll_ptr, /*!< out: roll ptr */ + ulint* info_bits); /*!< out: info bits state */ +/*******************************************************************//** +Builds an update vector based on a remaining part of an undo log record. +@return remaining part of the record, NULL if an error detected, which +means that the record is corrupted */ +UNIV_INTERN +byte* +trx_undo_update_rec_get_update( +/*===========================*/ + byte* ptr, /*!< in: remaining part in update undo log + record, after reading the row reference + NOTE that this copy of the undo log record must + be preserved as long as the update vector is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC, + TRX_UNDO_UPD_DEL_REC, or + TRX_UNDO_DEL_MARK_REC; in the last case, + only trx id and roll ptr fields are added to + the update vector */ + trx_id_t trx_id, /*!< in: transaction id from this undorecord */ + roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */ + ulint info_bits,/*!< in: info bits from this undo record */ + trx_t* trx, /*!< in: transaction */ + mem_heap_t* heap, /*!< in: memory heap from which the memory + needed is allocated */ + upd_t** upd); /*!< out, own: update vector */ +/*******************************************************************//** +Builds a partial row from an update undo log record, for purge. +It contains the columns which occur as ordering in any index of the table. +Any missing columns are indicated by col->mtype == DATA_MISSING. +@return pointer to remaining part of undo record */ +UNIV_INTERN +byte* +trx_undo_rec_get_partial_row( +/*=========================*/ + byte* ptr, /*!< in: remaining part in update undo log + record of a suitable type, at the start of + the stored index columns; + NOTE that this copy of the undo log record must + be preserved as long as the partial row is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t** row, /*!< out, own: partial row */ + ibool ignore_prefix, /*!< in: flag to indicate if we + expect blob prefixes in undo. Used + only in the assertion. */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ + __attribute__((nonnull, warn_unused_result)); +/***********************************************************************//** +Writes information to an undo log about an insert, update, or a delete marking +of a clustered index record. This information is used in a rollback of the +transaction and in consistent reads that must look to the history of this +transaction. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +trx_undo_report_row_operation( +/*==========================*/ + ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is + set, does nothing */ + ulint op_type, /*!< in: TRX_UNDO_INSERT_OP or + TRX_UNDO_MODIFY_OP */ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* clust_entry, /*!< in: in the case of an insert, + index entry to insert into the + clustered index, otherwise NULL */ + const upd_t* update, /*!< in: in the case of an update, + the update vector, otherwise NULL */ + ulint cmpl_info, /*!< in: compiler info on secondary + index updates */ + const rec_t* rec, /*!< in: case of an update or delete + marking, the record in the clustered + index, otherwise NULL */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the + inserted undo log record, + 0 if BTR_NO_UNDO_LOG + flag was specified */ + __attribute__((nonnull(3,4,10), warn_unused_result)); +/******************************************************************//** +Copies an undo record to heap. This function can be called if we know that +the undo log record exists. +@return own: copy of the record */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_undo_rec_low( +/*======================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer to record */ + mem_heap_t* heap) /*!< in: memory heap where copied */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Build a previous version of a clustered index record. The caller must +hold a latch on the index page of the clustered index record. +@retval true if previous version was built, or if it was an insert +or the table has been rebuilt +@retval false if the previous version is earlier than purge_view, +which means that it may have been removed */ +UNIV_INTERN +bool +trx_undo_prev_version_build( +/*========================*/ + const rec_t* index_rec,/*!< in: clustered index record in the + index tree */ + mtr_t* index_mtr,/*!< in: mtr which contains the latch to + index_rec page and purge_view */ + const rec_t* rec, /*!< in: version of a clustered index record */ + dict_index_t* index, /*!< in: clustered index */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mem_heap_t* heap, /*!< in: memory heap from which the memory + needed is allocated */ + rec_t** old_vers)/*!< out, own: previous version, or NULL if + rec is the first inserted version, or if + history data has been deleted */ + __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Parses a redo log record of adding an undo log record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_add_undo_rec( +/*========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page); /*!< in: page or NULL */ +/***********************************************************//** +Parses a redo log record of erasing of an undo page end. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_erase_page_end( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr); /*!< in: mtr or NULL */ + +#ifndef UNIV_HOTBACKUP + +/* Types of an undo log record: these have to be smaller than 16, as the +compilation info multiplied by 16 is ORed to this value in an undo log +record */ + +#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */ +#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked + record */ +#define TRX_UNDO_UPD_DEL_REC 13 /* update of a delete marked record to + a not delete marked record; also the + fields of the record can change */ +#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields + do not change */ +#define TRX_UNDO_CMPL_INFO_MULT 16 /* compilation info is multiplied by + this and ORed to the type above */ +#define TRX_UNDO_UPD_EXTERN 128 /* This bit can be ORed to type_cmpl + to denote that we updated external + storage fields: used by purge to + free the external storage */ + +/* Operation type flags used in trx_undo_report_row_operation */ +#define TRX_UNDO_INSERT_OP 1 +#define TRX_UNDO_MODIFY_OP 2 + +#ifndef UNIV_NONINL +#include "trx0rec.ic" +#endif + +#endif /* !UNIV_HOTBACKUP */ + +#endif /* trx0rec_h */ diff --git a/storage/innobase/include/trx0rec.ic b/storage/innobase/include/trx0rec.ic new file mode 100644 index 00000000000..08704f6b821 --- /dev/null +++ b/storage/innobase/include/trx0rec.ic @@ -0,0 +1,113 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rec.ic +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Reads from an undo log record the record type. +@return record type */ +UNIV_INLINE +ulint +trx_undo_rec_get_type( +/*==================*/ + const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ +{ + return(mach_read_from_1(undo_rec + 2) & (TRX_UNDO_CMPL_INFO_MULT - 1)); +} + +/**********************************************************************//** +Reads from an undo log record the record compiler info. +@return compiler info */ +UNIV_INLINE +ulint +trx_undo_rec_get_cmpl_info( +/*=======================*/ + const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ +{ + return(mach_read_from_1(undo_rec + 2) / TRX_UNDO_CMPL_INFO_MULT); +} + +/**********************************************************************//** +Returns TRUE if an undo log record contains an extern storage field. +@return TRUE if extern */ +UNIV_INLINE +ibool +trx_undo_rec_get_extern_storage( +/*============================*/ + const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ +{ + if (mach_read_from_1(undo_rec + 2) & TRX_UNDO_UPD_EXTERN) { + + return(TRUE); + } + + return(FALSE); +} + +/**********************************************************************//** +Reads the undo log record number. +@return undo no */ +UNIV_INLINE +undo_no_t +trx_undo_rec_get_undo_no( +/*=====================*/ + const trx_undo_rec_t* undo_rec) /*!< in: undo log record */ +{ + const byte* ptr; + + ptr = undo_rec + 3; + + return(mach_ull_read_much_compressed(ptr)); +} + +/**********************************************************************//** +Returns the start of the undo record data area. +@return offset to the data area */ +UNIV_INLINE +ulint +trx_undo_rec_get_offset( +/*====================*/ + undo_no_t undo_no) /*!< in: undo no read from node */ +{ + return(3 + mach_ull_get_much_compressed_size(undo_no)); +} + +/***********************************************************************//** +Copies the undo record to the heap. +@return own: copy of undo log record */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_rec_copy( +/*==============*/ + const trx_undo_rec_t* undo_rec, /*!< in: undo log record */ + mem_heap_t* heap) /*!< in: heap where copied */ +{ + ulint len; + + len = mach_read_from_2(undo_rec) + - ut_align_offset(undo_rec, UNIV_PAGE_SIZE); + ut_ad(len < UNIV_PAGE_SIZE); + return((trx_undo_rec_t*) mem_heap_dup(heap, undo_rec, len)); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h new file mode 100644 index 00000000000..d5ab83d7767 --- /dev/null +++ b/storage/innobase/include/trx0roll.h @@ -0,0 +1,297 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0roll.h +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0roll_h +#define trx0roll_h + +#include "univ.i" +#include "trx0trx.h" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "trx0sys.h" + +extern bool trx_rollback_or_clean_is_active; + +/*******************************************************************//** +Determines if this transaction is rolling back an incomplete transaction +in crash recovery. +@return TRUE if trx is an incomplete transaction that is being rolled +back in crash recovery */ +UNIV_INTERN +ibool +trx_is_recv( +/*========*/ + const trx_t* trx); /*!< in: transaction */ +/*******************************************************************//** +Returns a transaction savepoint taken at this point in time. +@return savepoint */ +UNIV_INTERN +trx_savept_t +trx_savept_take( +/*============*/ + trx_t* trx); /*!< in: transaction */ +/*******************************************************************//** +Frees an undo number array. */ +UNIV_INTERN +void +trx_undo_arr_free( +/*==============*/ + trx_undo_arr_t* arr); /*!< in: undo number array */ +/*******************************************************************//** +Returns pointer to nth element in an undo number array. +@return pointer to the nth element */ +UNIV_INLINE +trx_undo_inf_t* +trx_undo_arr_get_nth_info( +/*======================*/ + trx_undo_arr_t* arr, /*!< in: undo number array */ + ulint n); /*!< in: position */ +/********************************************************************//** +Pops the topmost record when the two undo logs of a transaction are seen +as a single stack of records ordered by their undo numbers. Inserts the +undo number of the popped undo record to the array of currently processed +undo numbers in the transaction. When the query thread finishes processing +of this undo record, it must be released with trx_undo_rec_release. +@return undo log record copied to heap, NULL if none left, or if the +undo number of the top record would be less than the limit */ +UNIV_INTERN +trx_undo_rec_t* +trx_roll_pop_top_rec_of_trx( +/*========================*/ + trx_t* trx, /*!< in: transaction */ + undo_no_t limit, /*!< in: least undo number we need */ + roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */ + mem_heap_t* heap); /*!< in: memory heap where copied */ +/********************************************************************//** +Reserves an undo log record for a query thread to undo. This should be +called if the query thread gets the undo log record not using the pop +function above. +@return TRUE if succeeded */ +UNIV_INTERN +ibool +trx_undo_rec_reserve( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + undo_no_t undo_no);/*!< in: undo number of the record */ +/*******************************************************************//** +Releases a reserved undo record. */ +UNIV_INTERN +void +trx_undo_rec_release( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + undo_no_t undo_no);/*!< in: undo number */ +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. */ +UNIV_INTERN +void +trx_rollback_or_clean_recovered( +/*============================*/ + ibool all); /*!< in: FALSE=roll back dictionary transactions; + TRUE=roll back all non-PREPARED transactions */ +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. +@return a dummy parameter */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(trx_rollback_or_clean_all_recovered)( +/*================================================*/ + void* arg __attribute__((unused))); + /*!< in: a dummy parameter required by + os_thread_create */ +/*********************************************************************//** +Creates a rollback command node struct. +@return own: rollback node struct */ +UNIV_INTERN +roll_node_t* +roll_node_create( +/*=============*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Performs an execution step for a rollback command node in a query graph. +@return query thread to run next, or NULL */ +UNIV_INTERN +que_thr_t* +trx_rollback_step( +/*==============*/ + que_thr_t* thr); /*!< in: query thread */ +/*******************************************************************//** +Rollback a transaction used in MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_for_mysql( +/*===================*/ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); +/*******************************************************************//** +Rollback the latest SQL statement for MySQL. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_last_sql_stat_for_mysql( +/*=================================*/ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); +/*******************************************************************//** +Rollback a transaction to a given savepoint or do a complete rollback. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_to_savepoint( +/*======================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if + partial rollback requested, or NULL for + complete rollback */ + __attribute__((nonnull(1))); +/*******************************************************************//** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + ib_int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. +@return always DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_savepoint_for_mysql( +/*====================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + ib_int64_t binlog_cache_pos) /*!< in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ + __attribute__((nonnull)); +/*******************************************************************//** +Releases a named savepoint. Savepoints which +were set after this savepoint are deleted. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +UNIV_INTERN +dberr_t +trx_release_savepoint_for_mysql( +/*============================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name) /*!< in: savepoint name */ + __attribute__((nonnull, warn_unused_result)); +/*******************************************************************//** +Frees savepoint structs starting from savep. */ +UNIV_INTERN +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_named_savept_t* savep); /*!< in: free all savepoints > this one; + if this is NULL, free all savepoints + of trx */ + +/** A cell of trx_undo_arr_t; used during a rollback and a purge */ +struct trx_undo_inf_t{ + ibool in_use; /*!< true if cell is being used */ + trx_id_t trx_no; /*!< transaction number: not defined during + a rollback */ + undo_no_t undo_no;/*!< undo number of an undo record */ +}; + +/** During a rollback and a purge, undo numbers of undo records currently being +processed are stored in this array */ + +struct trx_undo_arr_t{ + ulint n_cells; /*!< number of cells in the array */ + ulint n_used; /*!< number of cells in use */ + trx_undo_inf_t* infos; /*!< the array of undo infos */ + mem_heap_t* heap; /*!< memory heap from which allocated */ +}; + +/** Rollback node states */ +enum roll_node_state { + ROLL_NODE_NONE = 0, /*!< Unknown state */ + ROLL_NODE_SEND, /*!< about to send a rollback signal to + the transaction */ + ROLL_NODE_WAIT /*!< rollback signal sent to the + transaction, waiting for completion */ +}; + +/** Rollback command node in a query graph */ +struct roll_node_t{ + que_common_t common; /*!< node type: QUE_NODE_ROLLBACK */ + enum roll_node_state state; /*!< node execution state */ + ibool partial;/*!< TRUE if we want a partial + rollback */ + trx_savept_t savept; /*!< savepoint to which to + roll back, in the case of a + partial rollback */ + que_thr_t* undo_thr;/*!< undo query graph */ +}; + +/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */ +struct trx_named_savept_t{ + char* name; /*!< savepoint name */ + trx_savept_t savept; /*!< the undo number corresponding to + the savepoint */ + ib_int64_t mysql_binlog_cache_pos; + /*!< the MySQL binlog cache position + corresponding to this savepoint, not + defined if the MySQL binlogging is not + enabled */ + UT_LIST_NODE_T(trx_named_savept_t) + trx_savepoints; /*!< the list of savepoints of a + transaction */ +}; + +#ifndef UNIV_NONINL +#include "trx0roll.ic" +#endif + +#endif diff --git a/storage/innobase/include/trx0roll.ic b/storage/innobase/include/trx0roll.ic new file mode 100644 index 00000000000..178e9bb730a --- /dev/null +++ b/storage/innobase/include/trx0roll.ic @@ -0,0 +1,40 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0roll.ic +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/*******************************************************************//** +Returns pointer to nth element in an undo number array. +@return pointer to the nth element */ +UNIV_INLINE +trx_undo_inf_t* +trx_undo_arr_get_nth_info( +/*======================*/ + trx_undo_arr_t* arr, /*!< in: undo number array */ + ulint n) /*!< in: position */ +{ + ut_ad(arr); + ut_ad(n < arr->n_cells); + + return(arr->infos + n); +} diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h new file mode 100644 index 00000000000..185b05876b4 --- /dev/null +++ b/storage/innobase/include/trx0rseg.h @@ -0,0 +1,230 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rseg.h +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0rseg_h +#define trx0rseg_h + +#include "univ.i" +#include "trx0types.h" +#include "trx0sys.h" +#include "ut0bh.h" + +/******************************************************************//** +Gets a rollback segment header. +@return rollback segment header, page x-latched */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get( +/*==========*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the header */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Gets a newly created rollback segment header. +@return rollback segment header, page x-latched */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get_new( +/*==============*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the header */ + mtr_t* mtr); /*!< in: mtr */ +/***************************************************************//** +Gets the file page number of the nth undo log slot. +@return page number of the undo log segment */ +UNIV_INLINE +ulint +trx_rsegf_get_nth_undo( +/*===================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + ulint n, /*!< in: index of slot */ + mtr_t* mtr); /*!< in: mtr */ +/***************************************************************//** +Sets the file page number of the nth undo log slot. */ +UNIV_INLINE +void +trx_rsegf_set_nth_undo( +/*===================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + ulint n, /*!< in: index of slot */ + ulint page_no,/*!< in: page number of the undo log segment */ + mtr_t* mtr); /*!< in: mtr */ +/****************************************************************//** +Looks for a free slot for an undo log segment. +@return slot index or ULINT_UNDEFINED if not found */ +UNIV_INLINE +ulint +trx_rsegf_undo_find_free( +/*=====================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Looks for a rollback segment, based on the rollback segment id. +@return rollback segment */ +UNIV_INLINE +trx_rseg_t* +trx_rseg_get_on_id( +/*===============*/ + ulint id); /*!< in: rollback segment id */ +/****************************************************************//** +Creates a rollback segment header. This function is called only when +a new rollback segment is created in the database. +@return page number of the created segment, FIL_NULL if fail */ +UNIV_INTERN +ulint +trx_rseg_header_create( +/*===================*/ + ulint space, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint max_size, /*!< in: max size in pages */ + ulint rseg_slot_no, /*!< in: rseg id == slot number in trx sys */ + mtr_t* mtr); /*!< in: mtr */ +/*********************************************************************//** +Creates the memory copies for rollback segments and initializes the +rseg array in trx_sys at a database startup. */ +UNIV_INTERN +void +trx_rseg_array_init( +/*================*/ + trx_sysf_t* sys_header, /*!< in/out: trx system header */ + ib_bh_t* ib_bh, /*!< in: rseg queue */ + mtr_t* mtr); /*!< in/out: mtr */ +/*************************************************************************** +Free's an instance of the rollback segment in memory. */ +UNIV_INTERN +void +trx_rseg_mem_free( +/*==============*/ + trx_rseg_t* rseg); /*!< in, own: instance to free */ + +/********************************************************************* +Creates a rollback segment. */ +UNIV_INTERN +trx_rseg_t* +trx_rseg_create( +/*============*/ + ulint space); /*!< in: id of UNDO tablespace */ + +/******************************************************************** +Get the number of unique rollback tablespaces in use except space id 0. +The last space id will be the sentinel value ULINT_UNDEFINED. The array +will be sorted on space id. Note: space_ids should have have space for +TRX_SYS_N_RSEGS + 1 elements. +@return number of unique rollback tablespaces in use. */ +UNIV_INTERN +ulint +trx_rseg_get_n_undo_tablespaces( +/*============================*/ + ulint* space_ids); /*!< out: array of space ids of + UNDO tablespaces */ +/* Number of undo log slots in a rollback segment file copy */ +#define TRX_RSEG_N_SLOTS (UNIV_PAGE_SIZE / 16) + +/* Maximum number of transactions supported by a single rollback segment */ +#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2) + +/* The rollback segment memory object */ +struct trx_rseg_t{ + /*--------------------------------------------------------*/ + ulint id; /*!< rollback segment id == the index of + its slot in the trx system file copy */ + ib_mutex_t mutex; /*!< mutex protecting the fields in this + struct except id, which is constant */ + ulint space; /*!< space where the rollback segment is + header is placed */ + ulint zip_size;/* compressed page size of space + in bytes, or 0 for uncompressed spaces */ + ulint page_no;/* page number of the rollback segment + header */ + ulint max_size;/* maximum allowed size in pages */ + ulint curr_size;/* current size in pages */ + /*--------------------------------------------------------*/ + /* Fields for update undo logs */ + UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_list; + /* List of update undo logs */ + UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_cached; + /* List of update undo log segments + cached for fast reuse */ + /*--------------------------------------------------------*/ + /* Fields for insert undo logs */ + UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_list; + /* List of insert undo logs */ + UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_cached; + /* List of insert undo log segments + cached for fast reuse */ + /*--------------------------------------------------------*/ + ulint last_page_no; /*!< Page number of the last not yet + purged log header in the history list; + FIL_NULL if all list purged */ + ulint last_offset; /*!< Byte offset of the last not yet + purged log header */ + trx_id_t last_trx_no; /*!< Transaction number of the last not + yet purged log */ + ibool last_del_marks; /*!< TRUE if the last not yet purged log + needs purging */ +}; + +/** For prioritising the rollback segments for purge. */ +struct rseg_queue_t { + trx_id_t trx_no; /*!< trx_rseg_t::last_trx_no */ + trx_rseg_t* rseg; /*!< Rollback segment */ +}; + +/* Undo log segment slot in a rollback segment header */ +/*-------------------------------------------------------------*/ +#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of + an undo log segment */ +/*-------------------------------------------------------------*/ +/* Slot size */ +#define TRX_RSEG_SLOT_SIZE 4 + +/* The offset of the rollback segment header on its page */ +#define TRX_RSEG FSEG_PAGE_DATA + +/* Transaction rollback segment header */ +/*-------------------------------------------------------------*/ +#define TRX_RSEG_MAX_SIZE 0 /* Maximum allowed size for rollback + segment in pages */ +#define TRX_RSEG_HISTORY_SIZE 4 /* Number of file pages occupied + by the logs in the history list */ +#define TRX_RSEG_HISTORY 8 /* The update undo logs for committed + transactions */ +#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE) + /* Header for the file segment where + this page is placed */ +#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE) + /* Undo log segment slots */ +/*-------------------------------------------------------------*/ + +#ifndef UNIV_NONINL +#include "trx0rseg.ic" +#endif + +#endif diff --git a/storage/innobase/include/trx0rseg.ic b/storage/innobase/include/trx0rseg.ic new file mode 100644 index 00000000000..30743da9b8c --- /dev/null +++ b/storage/innobase/include/trx0rseg.ic @@ -0,0 +1,167 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0rseg.ic +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "srv0srv.h" +#include "mtr0log.h" +#include "trx0sys.h" + +/******************************************************************//** +Gets a rollback segment header. +@return rollback segment header, page x-latched */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get( +/*==========*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the header */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + trx_rsegf_t* header; + + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_RSEG_HEADER); + + header = TRX_RSEG + buf_block_get_frame(block); + + return(header); +} + +/******************************************************************//** +Gets a newly created rollback segment header. +@return rollback segment header, page x-latched */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get_new( +/*==============*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number of the header */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + trx_rsegf_t* header; + + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); + + header = TRX_RSEG + buf_block_get_frame(block); + + return(header); +} + +/***************************************************************//** +Gets the file page number of the nth undo log slot. +@return page number of the undo log segment */ +UNIV_INLINE +ulint +trx_rsegf_get_nth_undo( +/*===================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + ulint n, /*!< in: index of slot */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (n >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: trying to get slot %lu of rseg\n", + (ulong) n); + ut_error; + } + + return(mtr_read_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + + n * TRX_RSEG_SLOT_SIZE, MLOG_4BYTES, mtr)); +} + +/***************************************************************//** +Sets the file page number of the nth undo log slot. */ +UNIV_INLINE +void +trx_rsegf_set_nth_undo( +/*===================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + ulint n, /*!< in: index of slot */ + ulint page_no,/*!< in: page number of the undo log segment */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (n >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: trying to set slot %lu of rseg\n", + (ulong) n); + ut_error; + } + + mlog_write_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE, + page_no, MLOG_4BYTES, mtr); +} + +/****************************************************************//** +Looks for a free slot for an undo log segment. +@return slot index or ULINT_UNDEFINED if not found */ +UNIV_INLINE +ulint +trx_rsegf_undo_find_free( +/*=====================*/ + trx_rsegf_t* rsegf, /*!< in: rollback segment header */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint i; + ulint page_no; + + for (i = 0; +#ifndef UNIV_DEBUG + i < TRX_RSEG_N_SLOTS; +#else + i < (trx_rseg_n_slots_debug ? trx_rseg_n_slots_debug : TRX_RSEG_N_SLOTS); +#endif + i++) { + + page_no = trx_rsegf_get_nth_undo(rsegf, i, mtr); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/******************************************************************//** +Looks for a rollback segment, based on the rollback segment id. +@return rollback segment */ +UNIV_INLINE +trx_rseg_t* +trx_rseg_get_on_id( +/*===============*/ + ulint id) /*!< in: rollback segment id */ +{ + ut_a(id < TRX_SYS_N_RSEGS); + + return(trx_sys->rseg_array[id]); +} + diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h new file mode 100644 index 00000000000..70f214d1ac7 --- /dev/null +++ b/storage/innobase/include/trx0sys.h @@ -0,0 +1,674 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0sys.h +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0sys_h +#define trx0sys_h + +#include "univ.i" + +#include "trx0types.h" +#include "fsp0types.h" +#include "fil0fil.h" +#include "buf0buf.h" +#ifndef UNIV_HOTBACKUP +#include "mtr0mtr.h" +#include "ut0byte.h" +#include "mem0mem.h" +#include "sync0sync.h" +#include "ut0lst.h" +#include "ut0bh.h" +#include "read0types.h" +#include "page0types.h" +#include "ut0bh.h" + +typedef UT_LIST_BASE_NODE_T(trx_t) trx_list_t; + +/** In a MySQL replication slave, in crash recovery we store the master log +file name and position here. */ +/* @{ */ +/** Master binlog file name */ +extern char trx_sys_mysql_master_log_name[]; +/** Master binlog file position. We have successfully got the updates +up to this position. -1 means that no crash recovery was needed, or +there was no master log position info inside InnoDB.*/ +extern ib_int64_t trx_sys_mysql_master_log_pos; +/* @} */ + +/** If this MySQL server uses binary logging, after InnoDB has been inited +and if it has done a crash recovery, we store the binlog file name and position +here. */ +/* @{ */ +/** Binlog file name */ +extern char trx_sys_mysql_bin_log_name[]; +/** Binlog file position, or -1 if unknown */ +extern ib_int64_t trx_sys_mysql_bin_log_pos; +/* @} */ + +/** The transaction system */ +extern trx_sys_t* trx_sys; + +/***************************************************************//** +Checks if a page address is the trx sys header page. +@return TRUE if trx sys header page */ +UNIV_INLINE +ibool +trx_sys_hdr_page( +/*=============*/ + ulint space, /*!< in: space */ + ulint page_no);/*!< in: page number */ +/*****************************************************************//** +Creates and initializes the central memory structures for the transaction +system. This is called when the database is started. +@return min binary heap of rsegs to purge */ +UNIV_INTERN +ib_bh_t* +trx_sys_init_at_db_start(void); +/*==========================*/ +/*****************************************************************//** +Creates the trx_sys instance and initializes ib_bh and mutex. */ +UNIV_INTERN +void +trx_sys_create(void); +/*================*/ +/*****************************************************************//** +Creates and initializes the transaction system at the database creation. */ +UNIV_INTERN +void +trx_sys_create_sys_pages(void); +/*==========================*/ +/****************************************************************//** +Looks for a free slot for a rollback segment in the trx system file copy. +@return slot index or ULINT_UNDEFINED if not found */ +UNIV_INTERN +ulint +trx_sysf_rseg_find_free( +/*====================*/ + mtr_t* mtr); /*!< in: mtr */ +/***************************************************************//** +Gets the pointer in the nth slot of the rseg array. +@return pointer to rseg object, NULL if slot not in use */ +UNIV_INLINE +trx_rseg_t* +trx_sys_get_nth_rseg( +/*=================*/ + trx_sys_t* sys, /*!< in: trx system */ + ulint n); /*!< in: index of slot */ +/**********************************************************************//** +Gets a pointer to the transaction system file copy and x-locks its page. +@return pointer to system file copy, page x-locked */ +UNIV_INLINE +trx_sysf_t* +trx_sysf_get( +/*=========*/ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Gets the space of the nth rollback segment slot in the trx system +file copy. +@return space id */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_space( +/*====================*/ + trx_sysf_t* sys_header, /*!< in: trx sys file copy */ + ulint i, /*!< in: slot index == rseg id */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Gets the page number of the nth rollback segment slot in the trx system +file copy. +@return page number, FIL_NULL if slot unused */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_page_no( +/*======================*/ + trx_sysf_t* sys_header, /*!< in: trx sys file copy */ + ulint i, /*!< in: slot index == rseg id */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Sets the space id of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +void +trx_sysf_rseg_set_space( +/*====================*/ + trx_sysf_t* sys_header, /*!< in: trx sys file copy */ + ulint i, /*!< in: slot index == rseg id */ + ulint space, /*!< in: space id */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Sets the page number of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +void +trx_sysf_rseg_set_page_no( +/*======================*/ + trx_sysf_t* sys_header, /*!< in: trx sys file copy */ + ulint i, /*!< in: slot index == rseg id */ + ulint page_no, /*!< in: page number, FIL_NULL if + the slot is reset to unused */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Allocates a new transaction id. +@return new, allocated trx id */ +UNIV_INLINE +trx_id_t +trx_sys_get_new_trx_id(void); +/*========================*/ +/*****************************************************************//** +Determines the maximum transaction id. +@return maximum currently allocated trx id; will be stale after the +next call to trx_sys_get_new_trx_id() */ +UNIV_INLINE +trx_id_t +trx_sys_get_max_trx_id(void); +/*========================*/ + +#ifdef UNIV_DEBUG +/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ +extern uint trx_rseg_n_slots_debug; +#endif + +/*****************************************************************//** +Writes a trx id to an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_trx_id( +/*=============*/ + byte* ptr, /*!< in: pointer to memory where written */ + trx_id_t id); /*!< in: id */ +/*****************************************************************//** +Reads a trx id from an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_read_... +@return id */ +UNIV_INLINE +trx_id_t +trx_read_trx_id( +/*============*/ + const byte* ptr); /*!< in: pointer to memory from where to read */ +/****************************************************************//** +Looks for the trx instance with the given id in the rw trx_list. +The caller must be holding trx_sys->mutex. +@return the trx handle or NULL if not found; +the pointer must not be dereferenced unless lock_sys->mutex was +acquired before calling this function and is still being held */ +UNIV_INLINE +trx_t* +trx_get_rw_trx_by_id( +/*=================*/ + trx_id_t trx_id);/*!< in: trx id to search for */ +/****************************************************************//** +Returns the minimum trx id in rw trx list. This is the smallest id for which +the trx can possibly be active. (But, you must look at the trx->state to +find out if the minimum trx id transaction itself is active, or already +committed.) +@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */ +UNIV_INLINE +trx_id_t +trx_rw_min_trx_id(void); +/*===================*/ +/****************************************************************//** +Checks if a rw transaction with the given id is active. Caller must hold +trx_sys->mutex in shared mode. If the caller is not holding +lock_sys->mutex, the transaction may already have been committed. +@return transaction instance if active, or NULL; +the pointer must not be dereferenced unless lock_sys->mutex was +acquired before calling this function and is still being held */ +UNIV_INLINE +trx_t* +trx_rw_is_active_low( +/*=================*/ + trx_id_t trx_id, /*!< in: trx id of the transaction */ + ibool* corrupt); /*!< in: NULL or pointer to a flag + that will be set if corrupt */ +/****************************************************************//** +Checks if a rw transaction with the given id is active. If the caller is +not holding lock_sys->mutex, the transaction may already have been +committed. +@return transaction instance if active, or NULL; +the pointer must not be dereferenced unless lock_sys->mutex was +acquired before calling this function and is still being held */ +UNIV_INLINE +trx_t* +trx_rw_is_active( +/*=============*/ + trx_id_t trx_id, /*!< in: trx id of the transaction */ + ibool* corrupt); /*!< in: NULL or pointer to a flag + that will be set if corrupt */ +#ifdef UNIV_DEBUG +/****************************************************************//** +Checks whether a trx is in one of rw_trx_list or ro_trx_list. +@return TRUE if is in */ +UNIV_INTERN +ibool +trx_in_trx_list( +/*============*/ + const trx_t* in_trx) /*!< in: transaction */ + __attribute__((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/***********************************************************//** +Assert that a transaction has been recovered. +@return TRUE */ +UNIV_INLINE +ibool +trx_assert_recovered( +/*=================*/ + trx_id_t trx_id) /*!< in: transaction identifier */ + __attribute__((warn_unused_result)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +/*****************************************************************//** +Updates the offset information about the end of the MySQL binlog entry +which corresponds to the transaction just being committed. In a MySQL +replication slave updates the latest master binlog position up to which +replication has proceeded. */ +UNIV_INTERN +void +trx_sys_update_mysql_binlog_offset( +/*===============================*/ + const char* file_name,/*!< in: MySQL log file name */ + ib_int64_t offset, /*!< in: position in that log file */ + ulint field, /*!< in: offset of the MySQL log info field in + the trx sys header */ + mtr_t* mtr); /*!< in: mtr */ +/*****************************************************************//** +Prints to stderr the MySQL binlog offset info in the trx system header if +the magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset(void); +/*===================================*/ +/*****************************************************************//** +Prints to stderr the MySQL master log offset info in the trx system header if +the magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_master_log_pos(void); +/*====================================*/ +/*****************************************************************//** +Initializes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_init(void); +/*==========================*/ +/*****************************************************************//** +Closes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_close(void); +/*===========================*/ +/********************************************************************//** +Tags the system table space with minimum format id if it has not been +tagged yet. +WARNING: This function is only called during the startup and AFTER the +redo log application during recovery has finished. */ +UNIV_INTERN +void +trx_sys_file_format_tag_init(void); +/*==============================*/ +/*****************************************************************//** +Shutdown/Close the transaction system. */ +UNIV_INTERN +void +trx_sys_close(void); +/*===============*/ +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the name */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + const ulint id); /*!< in: id of the file format */ +/*****************************************************************//** +Set the file format id unconditionally except if it's already the +same value. +@return TRUE if value updated */ +UNIV_INTERN +ibool +trx_sys_file_format_max_set( +/*========================*/ + ulint format_id, /*!< in: file format id */ + const char** name); /*!< out: max file format name or + NULL if not needed. */ +/********************************************************************* +Creates the rollback segments +@return number of rollback segments that are active. */ +UNIV_INTERN +ulint +trx_sys_create_rsegs( +/*=================*/ + ulint n_spaces, /*!< number of tablespaces for UNDO logs */ + ulint n_rsegs); /*!< number of rollback segments to create */ +/*****************************************************************//** +Get the number of transaction in the system, independent of their state. +@return count of transactions in trx_sys_t::trx_list */ +UNIV_INLINE +ulint +trx_sys_get_n_rw_trx(void); +/*======================*/ + +/********************************************************************* +Check if there are any active (non-prepared) transactions. +@return total number of active transactions or 0 if none */ +UNIV_INTERN +ulint +trx_sys_any_active_transactions(void); +/*=================================*/ +#else /* !UNIV_HOTBACKUP */ +/*****************************************************************//** +Prints to stderr the MySQL binlog info in the system header if the +magic number shows it valid. */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset_from_page( +/*========================================*/ + const byte* page); /*!< in: buffer containing the trx + system header page, i.e., page number + TRX_SYS_PAGE_NO in the tablespace */ +/*****************************************************************//** +Reads the file format id from the first system table space file. +Even if the call succeeds and returns TRUE, the returned format id +may be ULINT_UNDEFINED signalling that the format id was not present +in the data file. +@return TRUE if call succeeds */ +UNIV_INTERN +ibool +trx_sys_read_file_format_id( +/*========================*/ + const char *pathname, /*!< in: pathname of the first system + table space file */ + ulint *format_id); /*!< out: file format of the system table + space */ +/*****************************************************************//** +Reads the file format id from the given per-table data file. +@return TRUE if call succeeds */ +UNIV_INTERN +ibool +trx_sys_read_pertable_file_format_id( +/*=================================*/ + const char *pathname, /*!< in: pathname of a per-table + datafile */ + ulint *format_id); /*!< out: file format of the per-table + data file */ +#endif /* !UNIV_HOTBACKUP */ +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the max format name */ +UNIV_INTERN +const char* +trx_sys_file_format_max_get(void); +/*=============================*/ +/*****************************************************************//** +Check for the max file format tag stored on disk. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +trx_sys_file_format_max_check( +/*==========================*/ + ulint max_format_id); /*!< in: the max format id to check */ +/********************************************************************//** +Update the file format tag in the system tablespace only if the given +format id is greater than the known max id. +@return TRUE if format_id was bigger than the known max id */ +UNIV_INTERN +ibool +trx_sys_file_format_max_upgrade( +/*============================*/ + const char** name, /*!< out: max file format name */ + ulint format_id); /*!< in: file format identifier */ +/*****************************************************************//** +Get the name representation of the file format from its id. +@return pointer to the name */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + const ulint id); /*!< in: id of the file format */ + +#ifdef UNIV_DEBUG +/*************************************************************//** +Validate the trx_sys_t::trx_list. */ +UNIV_INTERN +ibool +trx_sys_validate_trx_list(void); +/*===========================*/ +#endif /* UNIV_DEBUG */ + +/* The automatically created system rollback segment has this id */ +#define TRX_SYS_SYSTEM_RSEG_ID 0 + +/* Space id and page no where the trx system file copy resides */ +#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */ +#include "fsp0fsp.h" +#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO + +/* The offset of the transaction system header on the page */ +#define TRX_SYS FSEG_PAGE_DATA + +/** Transaction system header */ +/*------------------------------------------------------------- @{ */ +#define TRX_SYS_TRX_ID_STORE 0 /*!< the maximum trx id or trx + number modulo + TRX_SYS_TRX_ID_UPDATE_MARGIN + written to a file page by any + transaction; the assignment of + transaction ids continues from + this number rounded up by + TRX_SYS_TRX_ID_UPDATE_MARGIN + plus + TRX_SYS_TRX_ID_UPDATE_MARGIN + when the database is + started */ +#define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the + tablespace segment the trx + system is created into */ +#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE) + /*!< the start of the array of + rollback segment specification + slots */ +/*------------------------------------------------------------- @} */ + +/* Max number of rollback segments: the number of segment specification slots +in the transaction system array; rollback segment id must fit in one (signed) +byte, therefore 128; each slot is currently 8 bytes in size. If you want +to raise the level to 256 then you will need to fix some assertions that +impose the 7 bit restriction. e.g., mach_write_to_3() */ +#define TRX_SYS_N_RSEGS 128 +/* Originally, InnoDB defined TRX_SYS_N_RSEGS as 256 but created only one +rollback segment. It initialized some arrays with this number of entries. +We must remember this limit in order to keep file compatibility. */ +#define TRX_SYS_OLD_N_RSEGS 256 + +/** Maximum length of MySQL binlog file name, in bytes. +@see trx_sys_mysql_master_log_name +@see trx_sys_mysql_bin_log_name */ +#define TRX_SYS_MYSQL_LOG_NAME_LEN 512 +/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */ +#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344 + +#if UNIV_PAGE_SIZE_MIN < 4096 +# error "UNIV_PAGE_SIZE_MIN < 4096" +#endif +/** The offset of the MySQL replication info in the trx system header; +this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */ +#define TRX_SYS_MYSQL_MASTER_LOG_INFO (UNIV_PAGE_SIZE - 2000) + +/** The offset of the MySQL binlog offset info in the trx system header */ +#define TRX_SYS_MYSQL_LOG_INFO (UNIV_PAGE_SIZE - 1000) +#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is + TRX_SYS_MYSQL_LOG_MAGIC_N + if we have valid data in the + MySQL binlog info */ +#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH 4 /*!< high 4 bytes of the offset + within that file */ +#define TRX_SYS_MYSQL_LOG_OFFSET_LOW 8 /*!< low 4 bytes of the offset + within that file */ +#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */ + +/** Doublewrite buffer */ +/* @{ */ +/** The offset of the doublewrite buffer header on the trx system header page */ +#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200) +/*-------------------------------------------------------------*/ +#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg + containing the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE + /*!< 4-byte magic number which + shows if we already have + created the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE) + /*!< page number of the + first page in the first + sequence of 64 + (= FSP_EXTENT_SIZE) consecutive + pages in the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE) + /*!< page number of the + first page in the second + sequence of 64 consecutive + pages in the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /*!< we repeat + TRX_SYS_DOUBLEWRITE_MAGIC, + TRX_SYS_DOUBLEWRITE_BLOCK1, + TRX_SYS_DOUBLEWRITE_BLOCK2 + so that if the trx sys + header is half-written + to disk, we still may + be able to recover the + information */ +/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, +we must reset the doublewrite buffer, because starting from 4.1.x the +space id of a data page is stored into +FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */ +#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE) + +/*-------------------------------------------------------------*/ +/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */ +#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855 +/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */ +#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386 + +/** Size of the doublewrite block in pages */ +#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE +/* @} */ + +/** File format tag */ +/* @{ */ +/** The offset of the file format tag on the trx system header page +(TRX_SYS_PAGE_NO of TRX_SYS_SPACE) */ +#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16) + +/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid. The file format +identifier is added to this constant. */ +#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL +/** Contents of TRX_SYS_FILE_FORMAT_TAG+4 when valid */ +#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL +/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid. The file format +identifier is added to this 64-bit constant. */ +#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N \ + ((ib_uint64_t) TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH << 32 \ + | TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW) +/* @} */ + +#ifndef UNIV_HOTBACKUP +/** The transaction system central memory data structure. */ +struct trx_sys_t{ + + ib_mutex_t mutex; /*!< mutex protecting most fields in + this structure except when noted + otherwise */ + ulint n_prepared_trx; /*!< Number of transactions currently + in the XA PREPARED state */ + ulint n_prepared_recovered_trx; /*!< Number of transactions + currently in XA PREPARED state that are + also recovered. Such transactions cannot + be added during runtime. They can only + occur after recovery if mysqld crashed + while there were XA PREPARED + transactions. We disable query cache + if such transactions exist. */ + trx_id_t max_trx_id; /*!< The smallest number not yet + assigned as a transaction id or + transaction number */ +#ifdef UNIV_DEBUG + trx_id_t rw_max_trx_id; /*!< Max trx id of read-write transactions + which exist or existed */ +#endif + trx_list_t rw_trx_list; /*!< List of active and committed in + memory read-write transactions, sorted + on trx id, biggest first. Recovered + transactions are always on this list. */ + trx_list_t ro_trx_list; /*!< List of active and committed in + memory read-only transactions, sorted + on trx id, biggest first. NOTE: + The order for read-only transactions + is not necessary. We should exploit + this and increase concurrency during + add/remove. */ + trx_list_t mysql_trx_list; /*!< List of transactions created + for MySQL. All transactions on + ro_trx_list are on mysql_trx_list. The + rw_trx_list can contain system + transactions and recovered transactions + that will not be in the mysql_trx_list. + There can be active non-locking + auto-commit read only transactions that + are on this list but not on ro_trx_list. + mysql_trx_list may additionally contain + transactions that have not yet been + started in InnoDB. */ + trx_rseg_t* const rseg_array[TRX_SYS_N_RSEGS]; + /*!< Pointer array to rollback + segments; NULL if slot not in use; + created and destroyed in + single-threaded mode; not protected + by any mutex, because it is read-only + during multi-threaded operation */ + ulint rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY + list (update undo logs for committed + transactions), protected by + rseg->mutex */ + UT_LIST_BASE_NODE_T(read_view_t) view_list; + /*!< List of read views sorted + on trx no, biggest first */ +}; + +/** When a trx id which is zero modulo this number (which must be a power of +two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system +page is updated */ +#define TRX_SYS_TRX_ID_WRITE_MARGIN 256 +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_NONINL +#include "trx0sys.ic" +#endif + +#endif diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic new file mode 100644 index 00000000000..e097e29b551 --- /dev/null +++ b/storage/innobase/include/trx0sys.ic @@ -0,0 +1,512 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0sys.ic +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0trx.h" +#include "data0type.h" +#ifndef UNIV_HOTBACKUP +# include "srv0srv.h" +# include "mtr0log.h" + +/* The typedef for rseg slot in the file copy */ +typedef byte trx_sysf_rseg_t; + +/* Rollback segment specification slot offsets */ +/*-------------------------------------------------------------*/ +#define TRX_SYS_RSEG_SPACE 0 /* space where the segment + header is placed; starting with + MySQL/InnoDB 5.1.7, this is + UNIV_UNDEFINED if the slot is unused */ +#define TRX_SYS_RSEG_PAGE_NO 4 /* page number where the segment + header is placed; this is FIL_NULL + if the slot is unused */ +/*-------------------------------------------------------------*/ +/* Size of a rollback segment specification slot */ +#define TRX_SYS_RSEG_SLOT_SIZE 8 + +/*****************************************************************//** +Writes the value of max_trx_id to the file based trx system header. */ +UNIV_INTERN +void +trx_sys_flush_max_trx_id(void); +/*==========================*/ + +/***************************************************************//** +Checks if a page address is the trx sys header page. +@return TRUE if trx sys header page */ +UNIV_INLINE +ibool +trx_sys_hdr_page( +/*=============*/ + ulint space, /*!< in: space */ + ulint page_no)/*!< in: page number */ +{ + if ((space == TRX_SYS_SPACE) && (page_no == TRX_SYS_PAGE_NO)) { + + return(TRUE); + } + + return(FALSE); +} + +/***************************************************************//** +Gets the pointer in the nth slot of the rseg array. +@return pointer to rseg object, NULL if slot not in use */ +UNIV_INLINE +trx_rseg_t* +trx_sys_get_nth_rseg( +/*=================*/ + trx_sys_t* sys, /*!< in: trx system */ + ulint n) /*!< in: index of slot */ +{ + ut_ad(n < TRX_SYS_N_RSEGS); + + return(sys->rseg_array[n]); +} + +/**********************************************************************//** +Gets a pointer to the transaction system header and x-latches its page. +@return pointer to system header, page x-latched. */ +UNIV_INLINE +trx_sysf_t* +trx_sysf_get( +/*=========*/ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + trx_sysf_t* header; + + ut_ad(mtr); + + block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, + RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + + header = TRX_SYS + buf_block_get_frame(block); + + return(header); +} + +/*****************************************************************//** +Gets the space of the nth rollback segment slot in the trx system +file copy. +@return space id */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_space( +/*====================*/ + trx_sysf_t* sys_header, /*!< in: trx sys header */ + ulint i, /*!< in: slot index == rseg id */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE, MLOG_4BYTES, mtr)); +} + +/*****************************************************************//** +Gets the page number of the nth rollback segment slot in the trx system +header. +@return page number, FIL_NULL if slot unused */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_page_no( +/*======================*/ + trx_sysf_t* sys_header, /*!< in: trx system header */ + ulint i, /*!< in: slot index == rseg id */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_PAGE_NO, MLOG_4BYTES, mtr)); +} + +/*****************************************************************//** +Sets the space id of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +void +trx_sysf_rseg_set_space( +/*====================*/ + trx_sysf_t* sys_header, /*!< in: trx sys file copy */ + ulint i, /*!< in: slot index == rseg id */ + ulint space, /*!< in: space id */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + mlog_write_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE, + space, + MLOG_4BYTES, mtr); +} + +/*****************************************************************//** +Sets the page number of the nth rollback segment slot in the trx system +header. */ +UNIV_INLINE +void +trx_sysf_rseg_set_page_no( +/*======================*/ + trx_sysf_t* sys_header, /*!< in: trx sys header */ + ulint i, /*!< in: slot index == rseg id */ + ulint page_no, /*!< in: page number, FIL_NULL if the + slot is reset to unused */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + mlog_write_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_PAGE_NO, + page_no, + MLOG_4BYTES, mtr); +} +#endif /* !UNIV_HOTBACKUP */ + +/*****************************************************************//** +Writes a trx id to an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_trx_id( +/*=============*/ + byte* ptr, /*!< in: pointer to memory where written */ + trx_id_t id) /*!< in: id */ +{ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + mach_write_to_6(ptr, id); +} + +#ifndef UNIV_HOTBACKUP +/*****************************************************************//** +Reads a trx id from an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_read_... +@return id */ +UNIV_INLINE +trx_id_t +trx_read_trx_id( +/*============*/ + const byte* ptr) /*!< in: pointer to memory from where to read */ +{ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + return(mach_read_from_6(ptr)); +} + +/****************************************************************//** +Looks for the trx handle with the given id in rw_trx_list. +The caller must be holding trx_sys->mutex. +@return the trx handle or NULL if not found; +the pointer must not be dereferenced unless lock_sys->mutex was +acquired before calling this function and is still being held */ +UNIV_INLINE +trx_t* +trx_get_rw_trx_by_id( +/*=================*/ + trx_id_t trx_id) /*!< in: trx id to search for */ +{ + trx_t* trx; + ulint len; + trx_t* first; + + ut_ad(mutex_own(&trx_sys->mutex)); + + len = UT_LIST_GET_LEN(trx_sys->rw_trx_list); + + if (len == 0) { + return(NULL); + } + + /* Because the list is ordered on trx id in descending order, + we try to speed things up a bit. */ + + trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); + assert_trx_in_rw_list(trx); + + if (trx_id == trx->id) { + return(trx); + } else if (len == 1 || trx_id > trx->id) { + return(NULL); + } + + first = trx; + + trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list); + assert_trx_in_rw_list(trx); + + if (trx_id == trx->id) { + return(trx); + } else if (len == 2 || trx_id < trx->id) { + return(NULL); + } + + /* Search the list from the lower end (tail). */ + if (trx_id < (first->id + trx->id) >> 1) { + for (trx = UT_LIST_GET_PREV(trx_list, trx); + trx != NULL && trx_id > trx->id; + trx = UT_LIST_GET_PREV(trx_list, trx)) { + assert_trx_in_rw_list(trx); + } + } else { + for (trx = UT_LIST_GET_NEXT(trx_list, first); + trx != NULL && trx_id < trx->id; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + assert_trx_in_rw_list(trx); + } + } + + return((trx != NULL && trx->id == trx_id) ? trx : NULL); +} + +/****************************************************************//** +Returns the minimum trx id in trx list. This is the smallest id for which +the trx can possibly be active. (But, you must look at the trx->state +to find out if the minimum trx id transaction itself is active, or already +committed.). The caller must be holding the trx_sys_t::mutex in shared mode. +@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */ +UNIV_INLINE +trx_id_t +trx_rw_min_trx_id_low(void) +/*=======================*/ +{ + trx_id_t id; + const trx_t* trx; + + ut_ad(mutex_own(&trx_sys->mutex)); + + trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list); + + if (trx == NULL) { + id = trx_sys->max_trx_id; + } else { + assert_trx_in_rw_list(trx); + id = trx->id; + } + + return(id); +} + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/***********************************************************//** +Assert that a transaction has been recovered. +@return TRUE */ +UNIV_INLINE +ibool +trx_assert_recovered( +/*=================*/ + trx_id_t trx_id) /*!< in: transaction identifier */ +{ + const trx_t* trx; + + mutex_enter(&trx_sys->mutex); + + trx = trx_get_rw_trx_by_id(trx_id); + ut_a(trx->is_recovered); + + mutex_exit(&trx_sys->mutex); + + return(TRUE); +} +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + +/****************************************************************//** +Returns the minimum trx id in rw trx list. This is the smallest id for which +the rw trx can possibly be active. (But, you must look at the trx->state +to find out if the minimum trx id transaction itself is active, or already +committed.) +@return the minimum trx id, or trx_sys->max_trx_id if rw trx list is empty */ +UNIV_INLINE +trx_id_t +trx_rw_min_trx_id(void) +/*===================*/ +{ + trx_id_t id; + + mutex_enter(&trx_sys->mutex); + + id = trx_rw_min_trx_id_low(); + + mutex_exit(&trx_sys->mutex); + + return(id); +} + +/****************************************************************//** +Checks if a rw transaction with the given id is active. Caller must hold +trx_sys->mutex. If the caller is not holding lock_sys->mutex, the +transaction may already have been committed. +@return transaction instance if active, or NULL; +the pointer must not be dereferenced unless lock_sys->mutex was +acquired before calling this function and is still being held */ +UNIV_INLINE +trx_t* +trx_rw_is_active_low( +/*=================*/ + trx_id_t trx_id, /*!< in: trx id of the transaction */ + ibool* corrupt) /*!< in: NULL or pointer to a flag + that will be set if corrupt */ +{ + trx_t* trx; + + ut_ad(mutex_own(&trx_sys->mutex)); + + if (trx_id < trx_rw_min_trx_id_low()) { + + trx = NULL; + } else if (trx_id >= trx_sys->max_trx_id) { + + /* There must be corruption: we let the caller handle the + diagnostic prints in this case. */ + + trx = NULL; + if (corrupt != NULL) { + *corrupt = TRUE; + } + } else { + trx = trx_get_rw_trx_by_id(trx_id); + + if (trx != NULL + && trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) { + + trx = NULL; + } + } + + return(trx); +} + +/****************************************************************//** +Checks if a rw transaction with the given id is active. If the caller is +not holding lock_sys->mutex, the transaction may already have been +committed. +@return transaction instance if active, or NULL; +the pointer must not be dereferenced unless lock_sys->mutex was +acquired before calling this function and is still being held */ +UNIV_INLINE +trx_t* +trx_rw_is_active( +/*=============*/ + trx_id_t trx_id, /*!< in: trx id of the transaction */ + ibool* corrupt) /*!< in: NULL or pointer to a flag + that will be set if corrupt */ +{ + trx_t* trx; + + mutex_enter(&trx_sys->mutex); + + trx = trx_rw_is_active_low(trx_id, corrupt); + + mutex_exit(&trx_sys->mutex); + + return(trx); +} + +/*****************************************************************//** +Allocates a new transaction id. +@return new, allocated trx id */ +UNIV_INLINE +trx_id_t +trx_sys_get_new_trx_id(void) +/*========================*/ +{ + ut_ad(mutex_own(&trx_sys->mutex)); + + /* VERY important: after the database is started, max_trx_id value is + divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if + will evaluate to TRUE when this function is first time called, + and the value for trx id will be written to disk-based header! + Thus trx id values will not overlap when the database is + repeatedly started! */ + + if (!(trx_sys->max_trx_id % (trx_id_t) TRX_SYS_TRX_ID_WRITE_MARGIN)) { + + trx_sys_flush_max_trx_id(); + } + + return(trx_sys->max_trx_id++); +} + +/*****************************************************************//** +Determines the maximum transaction id. +@return maximum currently allocated trx id; will be stale after the +next call to trx_sys_get_new_trx_id() */ +UNIV_INLINE +trx_id_t +trx_sys_get_max_trx_id(void) +/*========================*/ +{ +#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN + trx_id_t max_trx_id; +#endif + + ut_ad(!mutex_own(&trx_sys->mutex)); + +#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN + /* Avoid torn reads. */ + mutex_enter(&trx_sys->mutex); + max_trx_id = trx_sys->max_trx_id; + mutex_exit(&trx_sys->mutex); + return(max_trx_id); +#else + /* Perform a dirty read. Callers should be prepared for stale + values, and we know that the value fits in a machine word, so + that it will be read and written atomically. */ + return(trx_sys->max_trx_id); +#endif +} + +/*****************************************************************//** +Get the number of transaction in the system, independent of their state. +@return count of transactions in trx_sys_t::rw_trx_list */ +UNIV_INLINE +ulint +trx_sys_get_n_rw_trx(void) +/*======================*/ +{ + ulint n_trx; + + mutex_enter(&trx_sys->mutex); + + n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list); + + mutex_exit(&trx_sys->mutex); + + return(n_trx); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h new file mode 100644 index 00000000000..144e1803975 --- /dev/null +++ b/storage/innobase/include/trx0trx.h @@ -0,0 +1,1116 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0trx.h +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0trx_h +#define trx0trx_h + +#include "univ.i" +#include "trx0types.h" +#include "dict0types.h" +#ifndef UNIV_HOTBACKUP +#include "lock0types.h" +#include "log0log.h" +#include "usr0types.h" +#include "que0types.h" +#include "mem0mem.h" +#include "read0types.h" +#include "trx0xa.h" +#include "ut0vec.h" +#include "fts0fts.h" + +/** Dummy session used currently in MySQL interface */ +extern sess_t* trx_dummy_sess; + +/********************************************************************//** +Releases the search latch if trx has reserved it. */ +UNIV_INLINE +void +trx_search_latch_release_if_reserved( +/*=================================*/ + trx_t* trx); /*!< in: transaction */ +/******************************************************************//** +Set detailed error message for the transaction. */ +UNIV_INTERN +void +trx_set_detailed_error( +/*===================*/ + trx_t* trx, /*!< in: transaction struct */ + const char* msg); /*!< in: detailed error message */ +/*************************************************************//** +Set detailed error message for the transaction from a file. Note that the +file is rewinded before reading from it. */ +UNIV_INTERN +void +trx_set_detailed_error_from_file( +/*=============================*/ + trx_t* trx, /*!< in: transaction struct */ + FILE* file); /*!< in: file to read message from */ +/****************************************************************//** +Retrieves the error_info field from a trx. +@return the error info */ +UNIV_INLINE +const dict_index_t* +trx_get_error_info( +/*===============*/ + const trx_t* trx); /*!< in: trx object */ +/********************************************************************//** +Creates a transaction object for MySQL. +@return own: transaction object */ +UNIV_INTERN +trx_t* +trx_allocate_for_mysql(void); +/*========================*/ +/********************************************************************//** +Creates a transaction object for background operations by the master thread. +@return own: transaction object */ +UNIV_INTERN +trx_t* +trx_allocate_for_background(void); +/*=============================*/ +/********************************************************************//** +Frees a transaction object of a background operation of the master thread. */ +UNIV_INTERN +void +trx_free_for_background( +/*====================*/ + trx_t* trx); /*!< in, own: trx object */ +/********************************************************************//** +At shutdown, frees a transaction object that is in the PREPARED state. */ +UNIV_INTERN +void +trx_free_prepared( +/*==============*/ + trx_t* trx) /*!< in, own: trx object */ + UNIV_COLD __attribute__((nonnull)); +/********************************************************************//** +Frees a transaction object for MySQL. */ +UNIV_INTERN +void +trx_free_for_mysql( +/*===============*/ + trx_t* trx); /*!< in, own: trx object */ +/****************************************************************//** +Creates trx objects for transactions and initializes the trx list of +trx_sys at database start. Rollback segment and undo log lists must +already exist when this function is called, because the lists of +transactions to be rolled back or cleaned up are built based on the +undo log lists. */ +UNIV_INTERN +void +trx_lists_init_at_db_start(void); +/*============================*/ + +#ifdef UNIV_DEBUG +#define trx_start_if_not_started_xa(t) \ + { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_xa_low((t)); \ + } +#else +#define trx_start_if_not_started_xa(t) \ + trx_start_if_not_started_xa_low((t)) +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +UNIV_INTERN +void +trx_start_if_not_started_xa_low( +/*============================*/ + trx_t* trx); /*!< in: transaction */ +/*************************************************************//** +Starts the transaction if it is not yet started. */ +UNIV_INTERN +void +trx_start_if_not_started_low( +/*=========================*/ + trx_t* trx); /*!< in: transaction */ + +#ifdef UNIV_DEBUG +#define trx_start_if_not_started(t) \ + { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_low((t)); \ + } +#else +#define trx_start_if_not_started(t) \ + trx_start_if_not_started_low((t)) +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Starts the transaction for a DDL operation. */ +UNIV_INTERN +void +trx_start_for_ddl_low( +/*==================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_dict_op_t op) /*!< in: dictionary operation type */ + __attribute__((nonnull)); + +#ifdef UNIV_DEBUG +#define trx_start_for_ddl(t, o) \ + { \ + ut_ad((t)->start_file == 0); \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_for_ddl_low((t), (o)); \ + } +#else +#define trx_start_for_ddl(t, o) \ + trx_start_for_ddl_low((t), (o)) +#endif /* UNIV_DEBUG */ + +/****************************************************************//** +Commits a transaction. */ +UNIV_INTERN +void +trx_commit( +/*=======*/ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); +/****************************************************************//** +Commits a transaction and a mini-transaction. */ +UNIV_INTERN +void +trx_commit_low( +/*===========*/ + trx_t* trx, /*!< in/out: transaction */ + mtr_t* mtr) /*!< in/out: mini-transaction (will be committed), + or NULL if trx made no modifications */ + __attribute__((nonnull(1))); +/****************************************************************//** +Cleans up a transaction at database startup. The cleanup is needed if +the transaction already got to the middle of a commit when the database +crashed, and we cannot roll it back. */ +UNIV_INTERN +void +trx_cleanup_at_db_startup( +/*======================*/ + trx_t* trx); /*!< in: transaction */ +/**********************************************************************//** +Does the transaction commit for MySQL. +@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +trx_commit_for_mysql( +/*=================*/ + trx_t* trx); /*!< in/out: transaction */ +/**********************************************************************//** +Does the transaction prepare for MySQL. */ +UNIV_INTERN +void +trx_prepare_for_mysql( +/*==================*/ + trx_t* trx); /*!< in/out: trx handle */ +/**********************************************************************//** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. +@return number of prepared transactions */ +UNIV_INTERN +int +trx_recover_for_mysql( +/*==================*/ + XID* xid_list, /*!< in/out: prepared transactions */ + ulint len); /*!< in: number of slots in xid_list */ +/*******************************************************************//** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state +@return trx or NULL; on match, the trx->xid will be invalidated; +note that the trx may have been committed, unless the caller is +holding lock_sys->mutex */ +UNIV_INTERN +trx_t * +trx_get_trx_by_xid( +/*===============*/ + const XID* xid); /*!< in: X/Open XA transaction identifier */ +/**********************************************************************//** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. */ +UNIV_INTERN +void +trx_commit_complete_for_mysql( +/*==========================*/ + trx_t* trx) /*!< in/out: transaction */ + __attribute__((nonnull)); +/**********************************************************************//** +Marks the latest SQL statement ended. */ +UNIV_INTERN +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx); /*!< in: trx handle */ +/********************************************************************//** +Assigns a read view for a consistent read query. All the consistent reads +within the same transaction will get the same read view, which is created +when this function is first called for a new started transaction. +@return consistent read view */ +UNIV_INTERN +read_view_t* +trx_assign_read_view( +/*=================*/ + trx_t* trx); /*!< in: active transaction */ +/****************************************************************//** +Prepares a transaction for commit/rollback. */ +UNIV_INTERN +void +trx_commit_or_rollback_prepare( +/*===========================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Creates a commit command node struct. +@return own: commit node struct */ +UNIV_INTERN +commit_node_t* +trx_commit_node_create( +/*===================*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Performs an execution step for a commit type node in a query graph. +@return query thread to run next, or NULL */ +UNIV_INTERN +que_thr_t* +trx_commit_step( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ + +/**********************************************************************//** +Prints info about a transaction. +Caller must hold trx_sys->mutex. */ +UNIV_INTERN +void +trx_print_low( +/*==========*/ + FILE* f, + /*!< in: output stream */ + const trx_t* trx, + /*!< in: transaction */ + ulint max_query_len, + /*!< in: max query length to print, + or 0 to use the default max length */ + ulint n_rec_locks, + /*!< in: lock_number_of_rows_locked(&trx->lock) */ + ulint n_trx_locks, + /*!< in: length of trx->lock.trx_locks */ + ulint heap_size) + /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ + __attribute__((nonnull)); + +/**********************************************************************//** +Prints info about a transaction. +The caller must hold lock_sys->mutex and trx_sys->mutex. +When possible, use trx_print() instead. */ +UNIV_INTERN +void +trx_print_latched( +/*==============*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ + __attribute__((nonnull)); + +/**********************************************************************//** +Prints info about a transaction. +Acquires and releases lock_sys->mutex and trx_sys->mutex. */ +UNIV_INTERN +void +trx_print( +/*======*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ + __attribute__((nonnull)); + +/**********************************************************************//** +Determine if a transaction is a dictionary operation. +@return dictionary operation mode */ +UNIV_INLINE +enum trx_dict_op_t +trx_get_dict_operation( +/*===================*/ + const trx_t* trx) /*!< in: transaction */ + __attribute__((pure)); +/**********************************************************************//** +Flag a transaction a dictionary operation. */ +UNIV_INLINE +void +trx_set_dict_operation( +/*===================*/ + trx_t* trx, /*!< in/out: transaction */ + enum trx_dict_op_t op); /*!< in: operation, not + TRX_DICT_OP_NONE */ + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Determines if a transaction is in the given state. +The caller must hold trx_sys->mutex, or it must be the thread +that is serving a running transaction. +A running transaction must be in trx_sys->ro_trx_list or trx_sys->rw_trx_list +unless it is a non-locking autocommit read only transaction, which is only +in trx_sys->mysql_trx_list. +@return TRUE if trx->state == state */ +UNIV_INLINE +ibool +trx_state_eq( +/*=========*/ + const trx_t* trx, /*!< in: transaction */ + trx_state_t state) /*!< in: state; + if state != TRX_STATE_NOT_STARTED + asserts that + trx->state != TRX_STATE_NOT_STARTED */ + __attribute__((nonnull, warn_unused_result)); +# ifdef UNIV_DEBUG +/**********************************************************************//** +Asserts that a transaction has been started. +The caller must hold trx_sys->mutex. +@return TRUE if started */ +UNIV_INTERN +ibool +trx_assert_started( +/*===============*/ + const trx_t* trx) /*!< in: transaction */ + __attribute__((nonnull, warn_unused_result)); +# endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Determines if the currently running transaction has been interrupted. +@return TRUE if interrupted */ +UNIV_INTERN +ibool +trx_is_interrupted( +/*===============*/ + const trx_t* trx); /*!< in: transaction */ +/**********************************************************************//** +Determines if the currently running transaction is in strict mode. +@return TRUE if strict */ +UNIV_INTERN +ibool +trx_is_strict( +/*==========*/ + trx_t* trx); /*!< in: transaction */ +#else /* !UNIV_HOTBACKUP */ +#define trx_is_interrupted(trx) FALSE +#endif /* !UNIV_HOTBACKUP */ + +/*******************************************************************//** +Calculates the "weight" of a transaction. The weight of one transaction +is estimated as the number of altered rows + the number of locked rows. +@param t transaction +@return transaction weight */ +#define TRX_WEIGHT(t) ((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks)) + +/*******************************************************************//** +Compares the "weight" (or size) of two transactions. Transactions that +have edited non-transactional tables are considered heavier than ones +that have not. +@return TRUE if weight(a) >= weight(b) */ +UNIV_INTERN +ibool +trx_weight_ge( +/*==========*/ + const trx_t* a, /*!< in: the first transaction to be compared */ + const trx_t* b); /*!< in: the second transaction to be compared */ + +/* Maximum length of a string that can be returned by +trx_get_que_state_str(). */ +#define TRX_QUE_STATE_STR_MAX_LEN 12 /* "ROLLING BACK" */ + +/*******************************************************************//** +Retrieves transaction's que state in a human readable string. The string +should not be free()'d or modified. +@return string in the data segment */ +UNIV_INLINE +const char* +trx_get_que_state_str( +/*==================*/ + const trx_t* trx); /*!< in: transaction */ + +/****************************************************************//** +Assign a read-only transaction a rollback-segment, if it is attempting +to write to a TEMPORARY table. */ +UNIV_INTERN +void +trx_assign_rseg( +/*============*/ + trx_t* trx); /*!< A read-only transaction that + needs to be assigned a RBS. */ +/*******************************************************************//** +Transactions that aren't started by the MySQL server don't set +the trx_t::mysql_thd field. For such transactions we set the lock +wait timeout to 0 instead of the user configured value that comes +from innodb_lock_wait_timeout via trx_t::mysql_thd. +@param trx transaction +@return lock wait timeout in seconds */ +#define trx_lock_wait_timeout_get(trx) \ + ((trx)->mysql_thd != NULL \ + ? thd_lock_wait_timeout((trx)->mysql_thd) \ + : 0) + +/*******************************************************************//** +Determine if the transaction is a non-locking autocommit select +(implied read-only). +@param t transaction +@return true if non-locking autocommit select transaction. */ +#define trx_is_autocommit_non_locking(t) \ +((t)->auto_commit && (t)->will_lock == 0) + +/*******************************************************************//** +Determine if the transaction is a non-locking autocommit select +with an explicit check for the read-only status. +@param t transaction +@return true if non-locking autocommit read-only transaction. */ +#define trx_is_ac_nl_ro(t) \ +((t)->read_only && trx_is_autocommit_non_locking((t))) + +/*******************************************************************//** +Assert that the transaction is in the trx_sys_t::rw_trx_list */ +#define assert_trx_in_rw_list(t) do { \ + ut_ad(!(t)->read_only); \ + assert_trx_in_list(t); \ +} while (0) + +/*******************************************************************//** +Assert that the transaction is either in trx_sys->ro_trx_list or +trx_sys->rw_trx_list but not both and it cannot be an autocommit +non-locking select */ +#define assert_trx_in_list(t) do { \ + ut_ad((t)->in_ro_trx_list == (t)->read_only); \ + ut_ad((t)->in_rw_trx_list == !(t)->read_only); \ + ut_ad(!trx_is_autocommit_non_locking((t))); \ + switch ((t)->state) { \ + case TRX_STATE_PREPARED: \ + /* fall through */ \ + case TRX_STATE_ACTIVE: \ + case TRX_STATE_COMMITTED_IN_MEMORY: \ + continue; \ + case TRX_STATE_NOT_STARTED: \ + break; \ + } \ + ut_error; \ +} while (0) + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Assert that an autocommit non-locking select cannot be in the +ro_trx_list nor the rw_trx_list and that it is a read-only transaction. +The tranasction must be in the mysql_trx_list. */ +# define assert_trx_nonlocking_or_in_list(t) \ + do { \ + if (trx_is_autocommit_non_locking(t)) { \ + trx_state_t t_state = (t)->state; \ + ut_ad((t)->read_only); \ + ut_ad(!(t)->is_recovered); \ + ut_ad(!(t)->in_ro_trx_list); \ + ut_ad(!(t)->in_rw_trx_list); \ + ut_ad((t)->in_mysql_trx_list); \ + ut_ad(t_state == TRX_STATE_NOT_STARTED \ + || t_state == TRX_STATE_ACTIVE); \ + } else { \ + assert_trx_in_list(t); \ + } \ + } while (0) +#else /* UNIV_DEBUG */ +/*******************************************************************//** +Assert that an autocommit non-locking slect cannot be in the +ro_trx_list nor the rw_trx_list and that it is a read-only transaction. +The tranasction must be in the mysql_trx_list. */ +# define assert_trx_nonlocking_or_in_list(trx) ((void)0) +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Latching protocol for trx_lock_t::que_state. trx_lock_t::que_state +captures the state of the query thread during the execution of a query. +This is different from a transaction state. The query state of a transaction +can be updated asynchronously by other threads. The other threads can be +system threads, like the timeout monitor thread or user threads executing +other queries. Another thing to be mindful of is that there is a delay between +when a query thread is put into LOCK_WAIT state and before it actually starts +waiting. Between these two events it is possible that the query thread is +granted the lock it was waiting for, which implies that the state can be changed +asynchronously. + +All these operations take place within the context of locking. Therefore state +changes within the locking code must acquire both the lock mutex and the +trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or +trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient +to only acquire the trx->mutex. +To query the state either of the mutexes is sufficient within the locking +code and no mutex is required when the query thread is no longer waiting. */ + +/** The locks and state of an active transaction. Protected by +lock_sys->mutex, trx->mutex or both. */ +struct trx_lock_t { + ulint n_active_thrs; /*!< number of active query threads */ + + trx_que_t que_state; /*!< valid when trx->state + == TRX_STATE_ACTIVE: TRX_QUE_RUNNING, + TRX_QUE_LOCK_WAIT, ... */ + + lock_t* wait_lock; /*!< if trx execution state is + TRX_QUE_LOCK_WAIT, this points to + the lock request, otherwise this is + NULL; set to non-NULL when holding + both trx->mutex and lock_sys->mutex; + set to NULL when holding + lock_sys->mutex; readers should + hold lock_sys->mutex, except when + they are holding trx->mutex and + wait_lock==NULL */ + ib_uint64_t deadlock_mark; /*!< A mark field that is initialized + to and checked against lock_mark_counter + by lock_deadlock_recursive(). */ + ibool was_chosen_as_deadlock_victim; + /*!< when the transaction decides to + wait for a lock, it sets this to FALSE; + if another transaction chooses this + transaction as a victim in deadlock + resolution, it sets this to TRUE. + Protected by trx->mutex. */ + time_t wait_started; /*!< lock wait started at this time, + protected only by lock_sys->mutex */ + + que_thr_t* wait_thr; /*!< query thread belonging to this + trx that is in QUE_THR_LOCK_WAIT + state. For threads suspended in a + lock wait, this is protected by + lock_sys->mutex. Otherwise, this may + only be modified by the thread that is + serving the running transaction. */ + + mem_heap_t* lock_heap; /*!< memory heap for trx_locks; + protected by lock_sys->mutex */ + + UT_LIST_BASE_NODE_T(lock_t) + trx_locks; /*!< locks requested + by the transaction; + insertions are protected by trx->mutex + and lock_sys->mutex; removals are + protected by lock_sys->mutex */ + + ib_vector_t* table_locks; /*!< All table locks requested by this + transaction, including AUTOINC locks */ + + ibool cancel; /*!< TRUE if the transaction is being + rolled back either via deadlock + detection or due to lock timeout. The + caller has to acquire the trx_t::mutex + in order to cancel the locks. In + lock_trx_table_locks_remove() we + check for this cancel of a transaction's + locks and avoid reacquiring the trx + mutex to prevent recursive deadlocks. + Protected by both the lock sys mutex + and the trx_t::mutex. */ +}; + +#define TRX_MAGIC_N 91118598 + +/** The transaction handle + +Normally, there is a 1:1 relationship between a transaction handle +(trx) and a session (client connection). One session is associated +with exactly one user transaction. There are some exceptions to this: + +* For DDL operations, a subtransaction is allocated that modifies the +data dictionary tables. Lock waits and deadlocks are prevented by +acquiring the dict_operation_lock before starting the subtransaction +and releasing it after committing the subtransaction. + +* The purge system uses a special transaction that is not associated +with any session. + +* If the system crashed or it was quickly shut down while there were +transactions in the ACTIVE or PREPARED state, these transactions would +no longer be associated with a session when the server is restarted. + +A session may be served by at most one thread at a time. The serving +thread of a session might change in some MySQL implementations. +Therefore we do not have os_thread_get_curr_id() assertions in the code. + +Normally, only the thread that is currently associated with a running +transaction may access (read and modify) the trx object, and it may do +so without holding any mutex. The following are exceptions to this: + +* trx_rollback_resurrected() may access resurrected (connectionless) +transactions while the system is already processing new user +transactions. The trx_sys->mutex prevents a race condition between it +and lock_trx_release_locks() [invoked by trx_commit()]. + +* trx_print_low() may access transactions not associated with the current +thread. The caller must be holding trx_sys->mutex and lock_sys->mutex. + +* When a transaction handle is in the trx_sys->mysql_trx_list or +trx_sys->trx_list, some of its fields must not be modified without +holding trx_sys->mutex exclusively. + +* The locking code (in particular, lock_deadlock_recursive() and +lock_rec_convert_impl_to_expl()) will access transactions associated +to other connections. The locks of transactions are protected by +lock_sys->mutex and sometimes by trx->mutex. */ + +struct trx_t{ + ulint magic_n; + + ib_mutex_t mutex; /*!< Mutex protecting the fields + state and lock + (except some fields of lock, which + are protected by lock_sys->mutex) */ + + /** State of the trx from the point of view of concurrency control + and the valid state transitions. + + Possible states: + + TRX_STATE_NOT_STARTED + TRX_STATE_ACTIVE + TRX_STATE_PREPARED + TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED) + + Valid state transitions are: + + Regular transactions: + * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED + + Auto-commit non-locking read-only: + * NOT_STARTED -> ACTIVE -> NOT_STARTED + + XA (2PC): + * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED + + Recovered XA: + * NOT_STARTED -> PREPARED -> COMMITTED -> (freed) + + XA (2PC) (shutdown before ROLLBACK or COMMIT): + * NOT_STARTED -> PREPARED -> (freed) + + Latching and various transaction lists membership rules: + + XA (2PC) transactions are always treated as non-autocommit. + + Transitions to ACTIVE or NOT_STARTED occur when + !in_rw_trx_list and !in_ro_trx_list (no trx_sys->mutex needed). + + Autocommit non-locking read-only transactions move between states + without holding any mutex. They are !in_rw_trx_list, !in_ro_trx_list. + + When a transaction is NOT_STARTED, it can be in_mysql_trx_list if + it is a user transaction. It cannot be in ro_trx_list or rw_trx_list. + + ACTIVE->PREPARED->COMMITTED is only possible when trx->in_rw_trx_list. + The transition ACTIVE->PREPARED is protected by trx_sys->mutex. + + ACTIVE->COMMITTED is possible when the transaction is in + ro_trx_list or rw_trx_list. + + Transitions to COMMITTED are protected by both lock_sys->mutex + and trx->mutex. + + NOTE: Some of these state change constraints are an overkill, + currently only required for a consistent view for printing stats. + This unnecessarily adds a huge cost for the general case. + + NOTE: In the future we should add read only transactions to the + ro_trx_list the first time they try to acquire a lock ie. by default + we treat all read-only transactions as non-locking. */ + trx_state_t state; + + trx_lock_t lock; /*!< Information about the transaction + locks and state. Protected by + trx->mutex or lock_sys->mutex + or both */ + ulint is_recovered; /*!< 0=normal transaction, + 1=recovered, must be rolled back, + protected by trx_sys->mutex when + trx->in_rw_trx_list holds */ + + /* These fields are not protected by any mutex. */ + const char* op_info; /*!< English text describing the + current operation, or an empty + string */ + ulint isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */ + ulint check_foreigns; /*!< normally TRUE, but if the user + wants to suppress foreign key checks, + (in table imports, for example) we + set this FALSE */ + /*------------------------------*/ + /* MySQL has a transaction coordinator to coordinate two phase + commit between multiple storage engines and the binary log. When + an engine participates in a transaction, it's responsible for + registering itself using the trans_register_ha() API. */ + unsigned is_registered:1;/* This flag is set to 1 after the + transaction has been registered with + the coordinator using the XA API, and + is set to 0 after commit or rollback. */ + unsigned owns_prepare_mutex:1;/* 1 if owns prepare mutex, if + this is set to 1 then registered should + also be set to 1. This is used in the + XA code */ + /*------------------------------*/ + ulint check_unique_secondary; + /*!< normally TRUE, but if the user + wants to speed up inserts by + suppressing unique key checks + for secondary indexes when we decide + if we can use the insert buffer for + them, we set this FALSE */ + ulint support_xa; /*!< normally we do the XA two-phase + commit steps, but by setting this to + FALSE, one can save CPU time and about + 150 bytes in the undo log size as then + we skip XA steps */ + ulint flush_log_later;/* In 2PC, we hold the + prepare_commit mutex across + both phases. In that case, we + defer flush of the logs to disk + until after we release the + mutex. */ + ulint must_flush_log_later;/*!< this flag is set to TRUE in + trx_commit() if flush_log_later was + TRUE, and there were modifications by + the transaction; in that case we must + flush the log in + trx_commit_complete_for_mysql() */ + ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */ + ulint has_search_latch; + /*!< TRUE if this trx has latched the + search system latch in S-mode */ + ulint search_latch_timeout; + /*!< If we notice that someone is + waiting for our S-lock on the search + latch to be released, we wait in + row0sel.cc for BTR_SEA_TIMEOUT new + searches until we try to keep + the search latch again over + calls from MySQL; this is intended + to reduce contention on the search + latch */ + trx_dict_op_t dict_operation; /**< @see enum trx_dict_op */ + + /* Fields protected by the srv_conc_mutex. */ + ulint declared_to_be_inside_innodb; + /*!< this is TRUE if we have declared + this transaction in + srv_conc_enter_innodb to be inside the + InnoDB engine */ + ulint n_tickets_to_enter_innodb; + /*!< this can be > 0 only when + declared_to_... is TRUE; when we come + to srv_conc_innodb_enter, if the value + here is > 0, we decrement this by 1 */ + ulint dict_operation_lock_mode; + /*!< 0, RW_S_LATCH, or RW_X_LATCH: + the latch mode trx currently holds + on dict_operation_lock. Protected + by dict_operation_lock. */ + + trx_id_t no; /*!< transaction serialization number: + max trx id shortly before the + transaction is moved to + COMMITTED_IN_MEMORY state. + Protected by trx_sys_t::mutex + when trx->in_rw_trx_list. Initially + set to TRX_ID_MAX. */ + + time_t start_time; /*!< time the trx state last time became + TRX_STATE_ACTIVE */ + trx_id_t id; /*!< transaction id */ + XID xid; /*!< X/Open XA transaction + identification to identify a + transaction branch */ + lsn_t commit_lsn; /*!< lsn at the time of the commit */ + table_id_t table_id; /*!< Table to drop iff dict_operation + == TRX_DICT_OP_TABLE, or 0. */ + /*------------------------------*/ + THD* mysql_thd; /*!< MySQL thread handle corresponding + to this trx, or NULL */ + const char* mysql_log_file_name; + /*!< if MySQL binlog is used, this field + contains a pointer to the latest file + name; this is NULL if binlog is not + used */ + ib_int64_t mysql_log_offset; + /*!< if MySQL binlog is used, this + field contains the end offset of the + binlog entry */ + /*------------------------------*/ + ulint n_mysql_tables_in_use; /*!< number of Innobase tables + used in the processing of the current + SQL statement in MySQL */ + ulint mysql_n_tables_locked; + /*!< how many tables the current SQL + statement uses, except those + in consistent read */ + /*------------------------------*/ + UT_LIST_NODE_T(trx_t) + trx_list; /*!< list of transactions; + protected by trx_sys->mutex. + The same node is used for both + trx_sys_t::ro_trx_list and + trx_sys_t::rw_trx_list */ +#ifdef UNIV_DEBUG + /** The following two fields are mutually exclusive. */ + /* @{ */ + + ibool in_ro_trx_list; /*!< TRUE if in trx_sys->ro_trx_list */ + ibool in_rw_trx_list; /*!< TRUE if in trx_sys->rw_trx_list */ + /* @} */ +#endif /* UNIV_DEBUG */ + UT_LIST_NODE_T(trx_t) + mysql_trx_list; /*!< list of transactions created for + MySQL; protected by trx_sys->mutex */ +#ifdef UNIV_DEBUG + ibool in_mysql_trx_list; + /*!< TRUE if in + trx_sys->mysql_trx_list */ +#endif /* UNIV_DEBUG */ + /*------------------------------*/ + dberr_t error_state; /*!< 0 if no error, otherwise error + number; NOTE That ONLY the thread + doing the transaction is allowed to + set this field: this is NOT protected + by any mutex */ + const dict_index_t*error_info; /*!< if the error number indicates a + duplicate key error, a pointer to + the problematic index is stored here */ + ulint error_key_num; /*!< if the index creation fails to a + duplicate key error, a mysql key + number of that index is stored here */ + sess_t* sess; /*!< session of the trx, NULL if none */ + que_t* graph; /*!< query currently run in the session, + or NULL if none; NOTE that the query + belongs to the session, and it can + survive over a transaction commit, if + it is a stored procedure with a COMMIT + WORK statement, for instance */ + mem_heap_t* global_read_view_heap; + /*!< memory heap for the global read + view */ + read_view_t* global_read_view; + /*!< consistent read view associated + to a transaction or NULL */ + read_view_t* read_view; /*!< consistent read view used in the + transaction or NULL, this read view + if defined can be normal read view + associated to a transaction (i.e. + same as global_read_view) or read view + associated to a cursor */ + /*------------------------------*/ + UT_LIST_BASE_NODE_T(trx_named_savept_t) + trx_savepoints; /*!< savepoints set with SAVEPOINT ..., + oldest first */ + /*------------------------------*/ + ib_mutex_t undo_mutex; /*!< mutex protecting the fields in this + section (down to undo_no_arr), EXCEPT + last_sql_stat_start, which can be + accessed only when we know that there + cannot be any activity in the undo + logs! */ + undo_no_t undo_no; /*!< next undo log record number to + assign; since the undo log is + private for a transaction, this + is a simple ascending sequence + with no gaps; thus it represents + the number of modified/inserted + rows in a transaction */ + trx_savept_t last_sql_stat_start; + /*!< undo_no when the last sql statement + was started: in case of an error, trx + is rolled back down to this undo + number; see note at undo_mutex! */ + trx_rseg_t* rseg; /*!< rollback segment assigned to the + transaction, or NULL if not assigned + yet */ + trx_undo_t* insert_undo; /*!< pointer to the insert undo log, or + NULL if no inserts performed yet */ + trx_undo_t* update_undo; /*!< pointer to the update undo log, or + NULL if no update performed yet */ + undo_no_t roll_limit; /*!< least undo number to undo during + a rollback */ + ulint pages_undone; /*!< number of undo log pages undone + since the last undo log truncation */ + trx_undo_arr_t* undo_no_arr; /*!< array of undo numbers of undo log + records which are currently processed + by a rollback operation */ + /*------------------------------*/ + ulint n_autoinc_rows; /*!< no. of AUTO-INC rows required for + an SQL statement. This is useful for + multi-row INSERTs */ + ib_vector_t* autoinc_locks; /* AUTOINC locks held by this + transaction. Note that these are + also in the lock list trx_locks. This + vector needs to be freed explicitly + when the trx instance is destroyed. + Protected by lock_sys->mutex. */ + /*------------------------------*/ + ibool read_only; /*!< TRUE if transaction is flagged + as a READ-ONLY transaction. + if !auto_commit || will_lock > 0 + then it will added to the list + trx_sys_t::ro_trx_list. A read only + transaction will not be assigned an + UNDO log. Non-locking auto-commit + read-only transaction will not be on + either list. */ + ibool auto_commit; /*!< TRUE if it is an autocommit */ + ulint will_lock; /*!< Will acquire some locks. Increment + each time we determine that a lock will + be acquired by the MySQL layer. */ + bool ddl; /*!< true if it is a transaction that + is being started for a DDL operation */ + /*------------------------------*/ + fts_trx_t* fts_trx; /*!< FTS information, or NULL if + transaction hasn't modified tables + with FTS indexes (yet). */ + doc_id_t fts_next_doc_id;/* The document id used for updates */ + /*------------------------------*/ + ulint flush_tables; /*!< if "covering" the FLUSH TABLES", + count of tables being flushed. */ + + /*------------------------------*/ +#ifdef UNIV_DEBUG + ulint start_line; /*!< Track where it was started from */ + const char* start_file; /*!< Filename where it was started */ +#endif /* UNIV_DEBUG */ + /*------------------------------*/ + bool api_trx; /*!< trx started by InnoDB API */ + bool api_auto_commit;/*!< automatic commit */ + bool read_write; /*!< if read and write operation */ + + /*------------------------------*/ + char detailed_error[256]; /*!< detailed error message for last + error, or empty. */ +}; + +/* Transaction isolation levels (trx->isolation_level) */ +#define TRX_ISO_READ_UNCOMMITTED 0 /* dirty read: non-locking + SELECTs are performed so that + we do not look at a possible + earlier version of a record; + thus they are not 'consistent' + reads under this isolation + level; otherwise like level + 2 */ + +#define TRX_ISO_READ_COMMITTED 1 /* somewhat Oracle-like + isolation, except that in + range UPDATE and DELETE we + must block phantom rows + with next-key locks; + SELECT ... FOR UPDATE and ... + LOCK IN SHARE MODE only lock + the index records, NOT the + gaps before them, and thus + allow free inserting; + each consistent read reads its + own snapshot */ + +#define TRX_ISO_REPEATABLE_READ 2 /* this is the default; + all consistent reads in the + same trx read the same + snapshot; + full next-key locking used + in locking reads to block + insertions into gaps */ + +#define TRX_ISO_SERIALIZABLE 3 /* all plain SELECTs are + converted to LOCK IN SHARE + MODE reads */ + +/* Treatment of duplicate values (trx->duplicates; for example, in inserts). +Multiple flags can be combined with bitwise OR. */ +#define TRX_DUP_IGNORE 1 /* duplicate rows are to be updated */ +#define TRX_DUP_REPLACE 2 /* duplicate rows are to be replaced */ + + +/* Types of a trx signal */ +#define TRX_SIG_NO_SIGNAL 0 +#define TRX_SIG_TOTAL_ROLLBACK 1 +#define TRX_SIG_ROLLBACK_TO_SAVEPT 2 +#define TRX_SIG_COMMIT 3 +#define TRX_SIG_BREAK_EXECUTION 5 + +/* Sender types of a signal */ +#define TRX_SIG_SELF 0 /* sent by the session itself, or + by an error occurring within this + session */ +#define TRX_SIG_OTHER_SESS 1 /* sent by another session (which + must hold rights to this) */ + +/** Commit node states */ +enum commit_node_state { + COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to + the transaction */ + COMMIT_NODE_WAIT /*!< commit signal sent to the transaction, + waiting for completion */ +}; + +/** Commit command node in a query graph */ +struct commit_node_t{ + que_common_t common; /*!< node type: QUE_NODE_COMMIT */ + enum commit_node_state + state; /*!< node execution state */ +}; + + +/** Test if trx->mutex is owned. */ +#define trx_mutex_own(t) mutex_own(&t->mutex) + +/** Acquire the trx->mutex. */ +#define trx_mutex_enter(t) do { \ + mutex_enter(&t->mutex); \ +} while (0) + +/** Release the trx->mutex. */ +#define trx_mutex_exit(t) do { \ + mutex_exit(&t->mutex); \ +} while (0) + +/** @brief The latch protecting the adaptive search system + +This latch protects the +(1) hash index; +(2) columns of a record to which we have a pointer in the hash index; + +but does NOT protect: + +(3) next record offset field in a record; +(4) next or previous records on the same page. + +Bear in mind (3) and (4) when using the hash index. +*/ +extern rw_lock_t* btr_search_latch_temp; + +/** The latch protecting the adaptive search system */ +#define btr_search_latch (*btr_search_latch_temp) + +#ifndef UNIV_NONINL +#include "trx0trx.ic" +#endif +#endif /* !UNIV_HOTBACKUP */ + +#endif diff --git a/storage/innobase/include/trx0trx.ic b/storage/innobase/include/trx0trx.ic new file mode 100644 index 00000000000..69ee17ea98b --- /dev/null +++ b/storage/innobase/include/trx0trx.ic @@ -0,0 +1,180 @@ +/***************************************************************************** + +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0trx.ic +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/**********************************************************************//** +Determines if a transaction is in the given state. +The caller must hold trx_sys->mutex, or it must be the thread +that is serving a running transaction. +A running transaction must be in trx_sys->ro_trx_list or trx_sys->rw_trx_list +unless it is a non-locking autocommit read only transaction, which is only +in trx_sys->mysql_trx_list. +@return TRUE if trx->state == state */ +UNIV_INLINE +ibool +trx_state_eq( +/*=========*/ + const trx_t* trx, /*!< in: transaction */ + trx_state_t state) /*!< in: state; + if state != TRX_STATE_NOT_STARTED + asserts that + trx->state != TRX_STATE_NOT_STARTED */ +{ +#ifdef UNIV_DEBUG + switch (trx->state) { + case TRX_STATE_PREPARED: + ut_ad(!trx_is_autocommit_non_locking(trx)); + return(trx->state == state); + + case TRX_STATE_ACTIVE: + assert_trx_nonlocking_or_in_list(trx); + return(state == trx->state); + + case TRX_STATE_COMMITTED_IN_MEMORY: + assert_trx_in_list(trx); + return(state == trx->state); + + case TRX_STATE_NOT_STARTED: + /* This state is not allowed for running transactions. */ + ut_a(state == TRX_STATE_NOT_STARTED); + ut_ad(!trx->in_rw_trx_list); + ut_ad(!trx->in_ro_trx_list); + return(state == trx->state); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(trx->state == state); +} + +/****************************************************************//** +Retrieves the error_info field from a trx. +@return the error info */ +UNIV_INLINE +const dict_index_t* +trx_get_error_info( +/*===============*/ + const trx_t* trx) /*!< in: trx object */ +{ + return(trx->error_info); +} + +/*******************************************************************//** +Retrieves transaction's que state in a human readable string. The string +should not be free()'d or modified. +@return string in the data segment */ +UNIV_INLINE +const char* +trx_get_que_state_str( +/*==================*/ + const trx_t* trx) /*!< in: transaction */ +{ + /* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */ + switch (trx->lock.que_state) { + case TRX_QUE_RUNNING: + return("RUNNING"); + case TRX_QUE_LOCK_WAIT: + return("LOCK WAIT"); + case TRX_QUE_ROLLING_BACK: + return("ROLLING BACK"); + case TRX_QUE_COMMITTING: + return("COMMITTING"); + default: + return("UNKNOWN"); + } +} + +/**********************************************************************//** +Determine if a transaction is a dictionary operation. +@return dictionary operation mode */ +UNIV_INLINE +enum trx_dict_op_t +trx_get_dict_operation( +/*===================*/ + const trx_t* trx) /*!< in: transaction */ +{ + trx_dict_op_t op = static_cast<trx_dict_op_t>(trx->dict_operation); + +#ifdef UNIV_DEBUG + switch (op) { + case TRX_DICT_OP_NONE: + case TRX_DICT_OP_TABLE: + case TRX_DICT_OP_INDEX: + return(op); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(op); +} +/**********************************************************************//** +Flag a transaction a dictionary operation. */ +UNIV_INLINE +void +trx_set_dict_operation( +/*===================*/ + trx_t* trx, /*!< in/out: transaction */ + enum trx_dict_op_t op) /*!< in: operation, not + TRX_DICT_OP_NONE */ +{ +#ifdef UNIV_DEBUG + enum trx_dict_op_t old_op = trx_get_dict_operation(trx); + + switch (op) { + case TRX_DICT_OP_NONE: + ut_error; + break; + case TRX_DICT_OP_TABLE: + switch (old_op) { + case TRX_DICT_OP_NONE: + case TRX_DICT_OP_INDEX: + case TRX_DICT_OP_TABLE: + goto ok; + } + ut_error; + break; + case TRX_DICT_OP_INDEX: + ut_ad(old_op == TRX_DICT_OP_NONE); + break; + } +ok: +#endif /* UNIV_DEBUG */ + + trx->ddl = true; + trx->dict_operation = op; +} + +/********************************************************************//** +Releases the search latch if trx has reserved it. */ +UNIV_INLINE +void +trx_search_latch_release_if_reserved( +/*=================================*/ + trx_t* trx) /*!< in: transaction */ +{ + if (trx->has_search_latch) { + rw_lock_s_unlock(&btr_search_latch); + + trx->has_search_latch = FALSE; + } +} + diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h new file mode 100644 index 00000000000..7ca95131328 --- /dev/null +++ b/storage/innobase/include/trx0types.h @@ -0,0 +1,147 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0types.h +Transaction system global type definitions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0types_h +#define trx0types_h + +#include "ut0byte.h" + +/** printf(3) format used for printing DB_TRX_ID and other system fields */ +#define TRX_ID_FMT IB_ID_FMT + +/** maximum length that a formatted trx_t::id could take, not including +the terminating NUL character. */ +#define TRX_ID_MAX_LEN 17 + +/** Transaction execution states when trx->state == TRX_STATE_ACTIVE */ +enum trx_que_t { + TRX_QUE_RUNNING, /*!< transaction is running */ + TRX_QUE_LOCK_WAIT, /*!< transaction is waiting for + a lock */ + TRX_QUE_ROLLING_BACK, /*!< transaction is rolling back */ + TRX_QUE_COMMITTING /*!< transaction is committing */ +}; + +/** Transaction states (trx_t::state) */ +enum trx_state_t { + TRX_STATE_NOT_STARTED, + TRX_STATE_ACTIVE, + TRX_STATE_PREPARED, /* Support for 2PC/XA */ + TRX_STATE_COMMITTED_IN_MEMORY +}; + +/** Type of data dictionary operation */ +enum trx_dict_op_t { + /** The transaction is not modifying the data dictionary. */ + TRX_DICT_OP_NONE = 0, + /** The transaction is creating a table or an index, or + dropping a table. The table must be dropped in crash + recovery. This and TRX_DICT_OP_NONE are the only possible + operation modes in crash recovery. */ + TRX_DICT_OP_TABLE = 1, + /** The transaction is creating or dropping an index in an + existing table. In crash recovery, the data dictionary + must be locked, but the table must not be dropped. */ + TRX_DICT_OP_INDEX = 2 +}; + +/** Memory objects */ +/* @{ */ +/** Transaction */ +struct trx_t; +/** The locks and state of an active transaction */ +struct trx_lock_t; +/** Transaction system */ +struct trx_sys_t; +/** Signal */ +struct trx_sig_t; +/** Rollback segment */ +struct trx_rseg_t; +/** Transaction undo log */ +struct trx_undo_t; +/** Array of undo numbers of undo records being rolled back or purged */ +struct trx_undo_arr_t; +/** A cell of trx_undo_arr_t */ +struct trx_undo_inf_t; +/** The control structure used in the purge operation */ +struct trx_purge_t; +/** Rollback command node in a query graph */ +struct roll_node_t; +/** Commit command node in a query graph */ +struct commit_node_t; +/** SAVEPOINT command node in a query graph */ +struct trx_named_savept_t; +/* @} */ + +/** Rollback contexts */ +enum trx_rb_ctx { + RB_NONE = 0, /*!< no rollback */ + RB_NORMAL, /*!< normal rollback */ + RB_RECOVERY_PURGE_REC, + /*!< rolling back an incomplete transaction, + in crash recovery, rolling back an + INSERT that was performed by updating a + delete-marked record; if the delete-marked record + no longer exists in an active read view, it will + be purged */ + RB_RECOVERY /*!< rolling back an incomplete transaction, + in crash recovery */ +}; + +/** Row identifier (DB_ROW_ID, DATA_ROW_ID) */ +typedef ib_id_t row_id_t; +/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */ +typedef ib_id_t trx_id_t; +/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */ +typedef ib_id_t roll_ptr_t; +/** Undo number */ +typedef ib_id_t undo_no_t; + +/** Maximum transaction identifier */ +#define TRX_ID_MAX IB_ID_MAX + +/** Transaction savepoint */ +struct trx_savept_t{ + undo_no_t least_undo_no; /*!< least undo number to undo */ +}; + +/** File objects */ +/* @{ */ +/** Transaction system header */ +typedef byte trx_sysf_t; +/** Rollback segment header */ +typedef byte trx_rsegf_t; +/** Undo segment header */ +typedef byte trx_usegf_t; +/** Undo log header */ +typedef byte trx_ulogf_t; +/** Undo log page header */ +typedef byte trx_upagef_t; + +/** Undo log record */ +typedef byte trx_undo_rec_t; +/* @} */ + +#endif diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h new file mode 100644 index 00000000000..61b0dabb1e6 --- /dev/null +++ b/storage/innobase/include/trx0undo.h @@ -0,0 +1,604 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0undo.h +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0undo_h +#define trx0undo_h + +#include "univ.i" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "trx0sys.h" +#include "page0types.h" +#include "trx0xa.h" + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Builds a roll pointer. +@return roll pointer */ +UNIV_INLINE +roll_ptr_t +trx_undo_build_roll_ptr( +/*====================*/ + ibool is_insert, /*!< in: TRUE if insert undo log */ + ulint rseg_id, /*!< in: rollback segment id */ + ulint page_no, /*!< in: page number */ + ulint offset); /*!< in: offset of the undo entry within page */ +/***********************************************************************//** +Decodes a roll pointer. */ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer */ + ibool* is_insert, /*!< out: TRUE if insert undo log */ + ulint* rseg_id, /*!< out: rollback segment id */ + ulint* page_no, /*!< out: page number */ + ulint* offset); /*!< out: offset of the undo + entry within page */ +/***********************************************************************//** +Returns TRUE if the roll pointer is of the insert type. +@return TRUE if insert undo log */ +UNIV_INLINE +ibool +trx_undo_roll_ptr_is_insert( +/*========================*/ + roll_ptr_t roll_ptr); /*!< in: roll pointer */ +/***********************************************************************//** +Returns true if the record is of the insert type. +@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ + __attribute__((nonnull, pure, warn_unused_result)); +#endif /* !UNIV_HOTBACKUP */ +/*****************************************************************//** +Writes a roll ptr to an index page. In case that the size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_roll_ptr( +/*===============*/ + byte* ptr, /*!< in: pointer to memory where + written */ + roll_ptr_t roll_ptr); /*!< in: roll ptr */ +/*****************************************************************//** +Reads a roll ptr from an index page. In case that the roll ptr size +changes in some future version, this function should be used instead of +mach_read_... +@return roll ptr */ +UNIV_INLINE +roll_ptr_t +trx_read_roll_ptr( +/*==============*/ + const byte* ptr); /*!< in: pointer to memory from where to read */ +#ifndef UNIV_HOTBACKUP +/******************************************************************//** +Gets an undo log page and x-latches it. +@return pointer to page x-latched */ +UNIV_INLINE +page_t* +trx_undo_page_get( +/*==============*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Gets an undo log page and s-latches it. +@return pointer to page s-latched */ +UNIV_INLINE +page_t* +trx_undo_page_get_s_latched( +/*========================*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Returns the previous undo record on the page in the specified log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_prev_rec( +/*=======================*/ + trx_undo_rec_t* rec, /*!< in: undo log record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset);/*!< in: undo log header offset on page */ +/******************************************************************//** +Returns the next undo log record on the page in the specified log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_next_rec( +/*=======================*/ + trx_undo_rec_t* rec, /*!< in: undo log record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset);/*!< in: undo log header offset on page */ +/******************************************************************//** +Returns the last undo record on the page in the specified undo log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_last_rec( +/*=======================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset); /*!< in: undo log header offset on page */ +/******************************************************************//** +Returns the first undo record on the page in the specified undo log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_first_rec( +/*========================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset);/*!< in: undo log header offset on page */ +/***********************************************************************//** +Gets the previous record in an undo log. +@return undo log record, the page s-latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_prev_rec( +/*==================*/ + trx_undo_rec_t* rec, /*!< in: undo record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + bool shared, /*!< in: true=S-latch, false=X-latch */ + mtr_t* mtr); /*!< in: mtr */ +/***********************************************************************//** +Gets the next record in an undo log. +@return undo log record, the page s-latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_next_rec( +/*==================*/ + trx_undo_rec_t* rec, /*!< in: undo record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + mtr_t* mtr); /*!< in: mtr */ +/***********************************************************************//** +Gets the first record in an undo log. +@return undo log record, the page latched, NULL if none */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_first_rec( +/*===================*/ + ulint space, /*!< in: undo log header space */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset, /*!< in: undo log header offset on page */ + ulint mode, /*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr); /*!< in: mtr */ +/********************************************************************//** +Tries to add a page to the undo log segment where the undo log is placed. +@return X-latched block if success, else NULL */ +UNIV_INTERN +buf_block_t* +trx_undo_add_page( +/*==============*/ + trx_t* trx, /*!< in: transaction */ + trx_undo_t* undo, /*!< in: undo log memory object */ + mtr_t* mtr) /*!< in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ + __attribute__((nonnull, warn_unused_result)); +/********************************************************************//** +Frees the last undo log page. +The caller must hold the rollback segment mutex. */ +UNIV_INTERN +void +trx_undo_free_last_page_func( +/*==========================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log memory copy */ + mtr_t* mtr) /*!< in/out: mini-transaction which does not + have a latch to any undo log page or which + has allocated the undo log page */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +# define trx_undo_free_last_page(trx,undo,mtr) \ + trx_undo_free_last_page_func(trx,undo,mtr) +#else /* UNIV_DEBUG */ +# define trx_undo_free_last_page(trx,undo,mtr) \ + trx_undo_free_last_page_func(undo,mtr) +#endif /* UNIV_DEBUG */ + +/***********************************************************************//** +Truncates an undo log from the end. This function is used during a rollback +to free space from an undo log. */ +UNIV_INTERN +void +trx_undo_truncate_end_func( +/*=======================*/ +#ifdef UNIV_DEBUG + const trx_t* trx, /*!< in: transaction whose undo log it is */ +#endif /* UNIV_DEBUG */ + trx_undo_t* undo, /*!< in/out: undo log */ + undo_no_t limit) /*!< in: all undo records with undo number + >= this value should be truncated */ + __attribute__((nonnull)); +#ifdef UNIV_DEBUG +# define trx_undo_truncate_end(trx,undo,limit) \ + trx_undo_truncate_end_func(trx,undo,limit) +#else /* UNIV_DEBUG */ +# define trx_undo_truncate_end(trx,undo,limit) \ + trx_undo_truncate_end_func(undo,limit) +#endif /* UNIV_DEBUG */ + +/***********************************************************************//** +Truncates an undo log from the start. This function is used during a purge +operation. */ +UNIV_INTERN +void +trx_undo_truncate_start( +/*====================*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + ulint space, /*!< in: space id of the log */ + ulint hdr_page_no, /*!< in: header page number */ + ulint hdr_offset, /*!< in: header offset on the page */ + undo_no_t limit); /*!< in: all undo pages with + undo numbers < this value + should be truncated; NOTE that + the function only frees whole + pages; the header page is not + freed, but emptied, if all the + records there are < limit */ +/********************************************************************//** +Initializes the undo log lists for a rollback segment memory copy. +This function is only called when the database is started or a new +rollback segment created. +@return the combined size of undo log segments in pages */ +UNIV_INTERN +ulint +trx_undo_lists_init( +/*================*/ + trx_rseg_t* rseg); /*!< in: rollback segment memory object */ +/**********************************************************************//** +Assigns an undo log for a transaction. A new undo log is created or a cached +undo log reused. +@return DB_SUCCESS if undo log assign successful, possible error codes +are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE DB_READ_ONLY +DB_OUT_OF_MEMORY */ +UNIV_INTERN +dberr_t +trx_undo_assign_undo( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + ulint type) /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ + __attribute__((nonnull, warn_unused_result)); +/******************************************************************//** +Sets the state of the undo log segment at a transaction finish. +@return undo log segment header page, x-latched */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_finish( +/*=========================*/ + trx_undo_t* undo, /*!< in: undo log memory copy */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Sets the state of the undo log segment at a transaction prepare. +@return undo log segment header page, x-latched */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + trx_t* trx, /*!< in: transaction */ + trx_undo_t* undo, /*!< in: undo log memory copy */ + mtr_t* mtr); /*!< in: mtr */ + +/**********************************************************************//** +Adds the update undo log header as the first in the history list, and +frees the memory object, or puts it to the list of cached update undo log +segments. */ +UNIV_INTERN +void +trx_undo_update_cleanup( +/*====================*/ + trx_t* trx, /*!< in: trx owning the update undo log */ + page_t* undo_page, /*!< in: update undo log header page, + x-latched */ + mtr_t* mtr); /*!< in: mtr */ +/******************************************************************//** +Frees or caches an insert undo log after a transaction commit or rollback. +Knowledge of inserts is not needed after a commit or rollback, therefore +the data can be discarded. */ +UNIV_INTERN +void +trx_undo_insert_cleanup( +/*====================*/ + trx_t* trx); /*!< in: transaction handle */ + +/********************************************************************//** +At shutdown, frees the undo logs of a PREPARED transaction. */ +UNIV_INTERN +void +trx_undo_free_prepared( +/*===================*/ + trx_t* trx) /*!< in/out: PREPARED transaction */ + UNIV_COLD __attribute__((nonnull)); +#endif /* !UNIV_HOTBACKUP */ +/***********************************************************//** +Parses the redo log entry of an undo log page initialization. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_page_init( +/*=====================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/***********************************************************//** +Parses the redo log entry of an undo log page header create or reuse. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_page_header( +/*=======================*/ + ulint type, /*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/***********************************************************//** +Parses the redo log entry of an undo log page header discard. +@return end of log record or NULL */ +UNIV_INTERN +byte* +trx_undo_parse_discard_latest( +/*==========================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< in: page or NULL */ + mtr_t* mtr); /*!< in: mtr or NULL */ +/************************************************************************ +Frees an undo log memory copy. */ +UNIV_INTERN +void +trx_undo_mem_free( +/*==============*/ + trx_undo_t* undo); /* in: the undo object to be freed */ + +/* Types of an undo log segment */ +#define TRX_UNDO_INSERT 1 /* contains undo entries for inserts */ +#define TRX_UNDO_UPDATE 2 /* contains undo entries for updates + and delete markings: in short, + modifys (the name 'UPDATE' is a + historical relic) */ +/* States of an undo log segment */ +#define TRX_UNDO_ACTIVE 1 /* contains an undo log of an active + transaction */ +#define TRX_UNDO_CACHED 2 /* cached for quick reuse */ +#define TRX_UNDO_TO_FREE 3 /* insert undo segment can be freed */ +#define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be + reused: it can be freed in purge when + all undo data in it is removed */ +#define TRX_UNDO_PREPARED 5 /* contains an undo log of an + prepared transaction */ + +#ifndef UNIV_HOTBACKUP +/** Transaction undo log memory object; this is protected by the undo_mutex +in the corresponding transaction object */ + +struct trx_undo_t{ + /*-----------------------------*/ + ulint id; /*!< undo log slot number within the + rollback segment */ + ulint type; /*!< TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + ulint state; /*!< state of the corresponding undo log + segment */ + ibool del_marks; /*!< relevant only in an update undo + log: this is TRUE if the transaction may + have delete marked records, because of + a delete of a row or an update of an + indexed field; purge is then + necessary; also TRUE if the transaction + has updated an externally stored + field */ + trx_id_t trx_id; /*!< id of the trx assigned to the undo + log */ + XID xid; /*!< X/Open XA transaction + identification */ + ibool dict_operation; /*!< TRUE if a dict operation trx */ + table_id_t table_id; /*!< if a dict operation, then the table + id */ + trx_rseg_t* rseg; /*!< rseg where the undo log belongs */ + /*-----------------------------*/ + ulint space; /*!< space id where the undo log + placed */ + ulint zip_size; /*!< compressed page size of space + in bytes, or 0 for uncompressed */ + ulint hdr_page_no; /*!< page number of the header page in + the undo log */ + ulint hdr_offset; /*!< header offset of the undo log on + the page */ + ulint last_page_no; /*!< page number of the last page in the + undo log; this may differ from + top_page_no during a rollback */ + ulint size; /*!< current size in pages */ + /*-----------------------------*/ + ulint empty; /*!< TRUE if the stack of undo log + records is currently empty */ + ulint top_page_no; /*!< page number where the latest undo + log record was catenated; during + rollback the page from which the latest + undo record was chosen */ + ulint top_offset; /*!< offset of the latest undo record, + i.e., the topmost element in the undo + log if we think of it as a stack */ + undo_no_t top_undo_no; /*!< undo number of the latest record */ + buf_block_t* guess_block; /*!< guess for the buffer block where + the top page might reside */ + /*-----------------------------*/ + UT_LIST_NODE_T(trx_undo_t) undo_list; + /*!< undo log objects in the rollback + segment are chained into lists */ +}; +#endif /* !UNIV_HOTBACKUP */ + +/** The offset of the undo log page header on pages of the undo log */ +#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA +/*-------------------------------------------------------------*/ +/** Transaction undo log page header offsets */ +/* @{ */ +#define TRX_UNDO_PAGE_TYPE 0 /*!< TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ +#define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log + records for the LATEST transaction + start on this page (remember that + in an update undo log, the first page + can contain several undo logs) */ +#define TRX_UNDO_PAGE_FREE 4 /*!< On each page of the undo log this + field contains the byte offset of the + first free byte on the page */ +#define TRX_UNDO_PAGE_NODE 6 /*!< The file list node in the chain + of undo log pages */ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE) + /*!< Size of the transaction undo + log page header, in bytes */ +/* @} */ + +/** An update undo segment with just one page can be reused if it has +at most this many bytes used; we must leave space at least for one new undo +log header on the page */ + +#define TRX_UNDO_PAGE_REUSE_LIMIT (3 * UNIV_PAGE_SIZE / 4) + +/* An update undo log segment may contain several undo logs on its first page +if the undo logs took so little space that the segment could be cached and +reused. All the undo log headers are then on the first page, and the last one +owns the undo log records on subsequent pages if the segment is bigger than +one page. If an undo log is stored in a segment, then on the first page it is +allowed to have zero undo records, but if the segment extends to several +pages, then all the rest of the pages must contain at least one undo log +record. */ + +/** The offset of the undo log segment header on the first page of the undo +log segment */ + +#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) +/** Undo log segment header */ +/* @{ */ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_STATE 0 /*!< TRX_UNDO_ACTIVE, ... */ +#define TRX_UNDO_LAST_LOG 2 /*!< Offset of the last undo log header + on the segment header page, 0 if + none */ +#define TRX_UNDO_FSEG_HEADER 4 /*!< Header for the file segment which + the undo log segment occupies */ +#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE) + /*!< Base node for the list of pages in + the undo log segment; defined only on + the undo log segment's first page */ +/*-------------------------------------------------------------*/ +/** Size of the undo log segment header */ +#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE) +/* @} */ + + +/** The undo log header. There can be several undo log headers on the first +page of an update undo log segment. */ +/* @{ */ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_TRX_ID 0 /*!< Transaction id */ +#define TRX_UNDO_TRX_NO 8 /*!< Transaction number of the + transaction; defined only if the log + is in a history list */ +#define TRX_UNDO_DEL_MARKS 16 /*!< Defined only in an update undo + log: TRUE if the transaction may have + done delete markings of records, and + thus purge is necessary */ +#define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record + of this log on the header page; purge + may remove undo log record from the + log start, and therefore this is not + necessarily the same as this log + header end offset */ +#define TRX_UNDO_XID_EXISTS 20 /*!< TRUE if undo log header includes + X/Open XA transaction identification + XID */ +#define TRX_UNDO_DICT_TRANS 21 /*!< TRUE if the transaction is a table + create, index create, or drop + transaction: in recovery + the transaction cannot be rolled back + in the usual way: a 'rollback' rather + means dropping the created or dropped + table, if it still exists */ +#define TRX_UNDO_TABLE_ID 22 /*!< Id of the table if the preceding + field is TRUE */ +#define TRX_UNDO_NEXT_LOG 30 /*!< Offset of the next undo log header + on this page, 0 if none */ +#define TRX_UNDO_PREV_LOG 32 /*!< Offset of the previous undo log + header on this page, 0 if none */ +#define TRX_UNDO_HISTORY_NODE 34 /*!< If the log is put to the history + list, the file list node is here */ +/*-------------------------------------------------------------*/ +/** Size of the undo log header without XID information */ +#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE) + +/* Note: the writing of the undo log old header is coded by a log record +MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE. The appending of an XID to the +header is logged separately. In this sense, the XID is not really a member +of the undo log header. TODO: do not append the XID to the log header if XA +is not needed by the user. The XID wastes about 150 bytes of space in every +undo log. In the history list we may have millions of undo logs, which means +quite a large overhead. */ + +/** X/Open XA Transaction Identification (XID) */ +/* @{ */ +/** xid_t::formatID */ +#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE) +/** xid_t::gtrid_length */ +#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4) +/** xid_t::bqual_length */ +#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4) +/** Distributed transaction identifier data */ +#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4) +/*--------------------------------------------------------------*/ +#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE) + /*!< Total size of the undo log header + with the XA XID */ +/* @} */ + +#ifndef UNIV_NONINL +#include "trx0undo.ic" +#endif + +#endif diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic new file mode 100644 index 00000000000..577759d6c3d --- /dev/null +++ b/storage/innobase/include/trx0undo.ic @@ -0,0 +1,363 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0undo.ic +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "data0type.h" +#include "page0page.h" + +#ifndef UNIV_HOTBACKUP +/***********************************************************************//** +Builds a roll pointer. +@return roll pointer */ +UNIV_INLINE +roll_ptr_t +trx_undo_build_roll_ptr( +/*====================*/ + ibool is_insert, /*!< in: TRUE if insert undo log */ + ulint rseg_id, /*!< in: rollback segment id */ + ulint page_no, /*!< in: page number */ + ulint offset) /*!< in: offset of the undo entry within page */ +{ + roll_ptr_t roll_ptr; +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + ut_ad(is_insert == 0 || is_insert == 1); + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + ut_ad(offset < 65536); + + roll_ptr = (roll_ptr_t) is_insert << 55 + | (roll_ptr_t) rseg_id << 48 + | (roll_ptr_t) page_no << 16 + | offset; + return(roll_ptr); +} + +/***********************************************************************//** +Decodes a roll pointer. */ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer */ + ibool* is_insert, /*!< out: TRUE if insert undo log */ + ulint* rseg_id, /*!< out: rollback segment id */ + ulint* page_no, /*!< out: page number */ + ulint* offset) /*!< out: offset of the undo + entry within page */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif +#if TRUE != 1 +# error "TRUE != 1" +#endif + ut_ad(roll_ptr < (1ULL << 56)); + *offset = (ulint) roll_ptr & 0xFFFF; + roll_ptr >>= 16; + *page_no = (ulint) roll_ptr & 0xFFFFFFFF; + roll_ptr >>= 32; + *rseg_id = (ulint) roll_ptr & 0x7F; + roll_ptr >>= 7; + *is_insert = (ibool) roll_ptr; /* TRUE==1 */ +} + +/***********************************************************************//** +Returns TRUE if the roll pointer is of the insert type. +@return TRUE if insert undo log */ +UNIV_INLINE +ibool +trx_undo_roll_ptr_is_insert( +/*========================*/ + roll_ptr_t roll_ptr) /*!< in: roll pointer */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif +#if TRUE != 1 +# error "TRUE != 1" +#endif + ut_ad(roll_ptr < (1ULL << 56)); + return((ibool) (roll_ptr >> 55)); +} + +/***********************************************************************//** +Returns true if the record is of the insert type. +@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ +{ +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error +#endif + return(static_cast<bool>(trx_id[DATA_TRX_ID_LEN] >> 7)); +} +#endif /* !UNIV_HOTBACKUP */ + +/*****************************************************************//** +Writes a roll ptr to an index page. In case that the size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_roll_ptr( +/*===============*/ + byte* ptr, /*!< in: pointer to memory where + written */ + roll_ptr_t roll_ptr) /*!< in: roll ptr */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + mach_write_to_7(ptr, roll_ptr); +} + +/*****************************************************************//** +Reads a roll ptr from an index page. In case that the roll ptr size +changes in some future version, this function should be used instead of +mach_read_... +@return roll ptr */ +UNIV_INLINE +roll_ptr_t +trx_read_roll_ptr( +/*==============*/ + const byte* ptr) /*!< in: pointer to memory from where to read */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + return(mach_read_from_7(ptr)); +} + +#ifndef UNIV_HOTBACKUP +/******************************************************************//** +Gets an undo log page and x-latches it. +@return pointer to page x-latched */ +UNIV_INLINE +page_t* +trx_undo_page_get( +/*==============*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block = buf_page_get(space, zip_size, page_no, + RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + return(buf_block_get_frame(block)); +} + +/******************************************************************//** +Gets an undo log page and s-latches it. +@return pointer to page s-latched */ +UNIV_INLINE +page_t* +trx_undo_page_get_s_latched( +/*========================*/ + ulint space, /*!< in: space where placed */ + ulint zip_size, /*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /*!< in: page number */ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block = buf_page_get(space, zip_size, page_no, + RW_S_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + return(buf_block_get_frame(block)); +} + +/******************************************************************//** +Returns the start offset of the undo log records of the specified undo +log on the page. +@return start offset */ +UNIV_INLINE +ulint +trx_undo_page_get_start( +/*====================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + ulint start; + + if (page_no == page_get_page_no(undo_page)) { + + start = mach_read_from_2(offset + undo_page + + TRX_UNDO_LOG_START); + } else { + start = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE; + } + + return(start); +} + +/******************************************************************//** +Returns the end offset of the undo log records of the specified undo +log on the page. +@return end offset */ +UNIV_INLINE +ulint +trx_undo_page_get_end( +/*==================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + trx_ulogf_t* log_hdr; + ulint end; + + if (page_no == page_get_page_no(undo_page)) { + + log_hdr = undo_page + offset; + + end = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG); + + if (end == 0) { + end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + } + } else { + end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + } + + return(end); +} + +/******************************************************************//** +Returns the previous undo record on the page in the specified log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_prev_rec( +/*=======================*/ + trx_undo_rec_t* rec, /*!< in: undo log record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + page_t* undo_page; + ulint start; + + undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE); + + start = trx_undo_page_get_start(undo_page, page_no, offset); + + if (start + undo_page == rec) { + + return(NULL); + } + + return(undo_page + mach_read_from_2(rec - 2)); +} + +/******************************************************************//** +Returns the next undo log record on the page in the specified log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_next_rec( +/*=======================*/ + trx_undo_rec_t* rec, /*!< in: undo log record */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + page_t* undo_page; + ulint end; + ulint next; + + undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE); + + end = trx_undo_page_get_end(undo_page, page_no, offset); + + next = mach_read_from_2(rec); + + if (next == end) { + + return(NULL); + } + + return(undo_page + next); +} + +/******************************************************************//** +Returns the last undo record on the page in the specified undo log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_last_rec( +/*=======================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + ulint start; + ulint end; + + start = trx_undo_page_get_start(undo_page, page_no, offset); + end = trx_undo_page_get_end(undo_page, page_no, offset); + + if (start == end) { + + return(NULL); + } + + return(undo_page + mach_read_from_2(undo_page + end - 2)); +} + +/******************************************************************//** +Returns the first undo record on the page in the specified undo log, or +NULL if none exists. +@return pointer to record, NULL if none */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_first_rec( +/*========================*/ + page_t* undo_page,/*!< in: undo log page */ + ulint page_no,/*!< in: undo log header page number */ + ulint offset) /*!< in: undo log header offset on page */ +{ + ulint start; + ulint end; + + start = trx_undo_page_get_start(undo_page, page_no, offset); + end = trx_undo_page_get_end(undo_page, page_no, offset); + + if (start == end) { + + return(NULL); + } + + return(undo_page + start); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/trx0xa.h b/storage/innobase/include/trx0xa.h new file mode 100644 index 00000000000..7caddfb7ba4 --- /dev/null +++ b/storage/innobase/include/trx0xa.h @@ -0,0 +1,70 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/* + * Start of xa.h header + * + * Define a symbol to prevent multiple inclusions of this header file + */ +#ifndef XA_H +#define XA_H + +/* + * Transaction branch identification: XID and NULLXID: + */ +#ifndef XIDDATASIZE + +/** Sizes of transaction identifier */ +#define XIDDATASIZE 128 /*!< maximum size of a transaction + identifier, in bytes */ +#define MAXGTRIDSIZE 64 /*!< maximum size in bytes of gtrid */ +#define MAXBQUALSIZE 64 /*!< maximum size in bytes of bqual */ + +/** X/Open XA distributed transaction identifier */ +struct xid_t { + long formatID; /*!< format identifier; -1 + means that the XID is null */ + long gtrid_length; /*!< value from 1 through 64 */ + long bqual_length; /*!< value from 1 through 64 */ + char data[XIDDATASIZE]; /*!< distributed transaction + identifier */ +}; +/** X/Open XA distributed transaction identifier */ +typedef struct xid_t XID; +#endif +/** X/Open XA distributed transaction status codes */ +/* @{ */ +#define XA_OK 0 /*!< normal execution */ +#define XAER_ASYNC -2 /*!< asynchronous operation already + outstanding */ +#define XAER_RMERR -3 /*!< a resource manager error + occurred in the transaction + branch */ +#define XAER_NOTA -4 /*!< the XID is not valid */ +#define XAER_INVAL -5 /*!< invalid arguments were given */ +#define XAER_PROTO -6 /*!< routine invoked in an improper + context */ +#define XAER_RMFAIL -7 /*!< resource manager unavailable */ +#define XAER_DUPID -8 /*!< the XID already exists */ +#define XAER_OUTSIDE -9 /*!< resource manager doing + work outside transaction */ +/* @} */ +#endif /* ifndef XA_H */ +/* + * End of xa.h header + */ diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i new file mode 100644 index 00000000000..8c325ecc88c --- /dev/null +++ b/storage/innobase/include/univ.i @@ -0,0 +1,667 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***********************************************************************//** +@file include/univ.i +Version control for database, common definitions, and include files + +Created 1/20/1994 Heikki Tuuri +****************************************************************************/ + +#ifndef univ_i +#define univ_i + +#ifdef UNIV_HOTBACKUP +#include "hb_univ.i" +#endif /* UNIV_HOTBACKUP */ + +/* aux macros to convert M into "123" (string) if M is defined like +#define M 123 */ +#define _IB_TO_STR(s) #s +#define IB_TO_STR(s) _IB_TO_STR(s) + +#define INNODB_VERSION_MAJOR MYSQL_VERSION_MAJOR +#define INNODB_VERSION_MINOR MYSQL_VERSION_MINOR +#define INNODB_VERSION_BUGFIX MYSQL_VERSION_PATCH + +/* The following is the InnoDB version as shown in +SELECT plugin_version FROM information_schema.plugins; +calculated in make_version_string() in sql/sql_show.cc like this: +"version >> 8" . "version & 0xff" +because the version is shown with only one dot, we skip the last +component, i.e. we show M.N.P as M.N */ +#define INNODB_VERSION_SHORT \ + (INNODB_VERSION_MAJOR << 8 | INNODB_VERSION_MINOR) + +#define INNODB_VERSION_STR \ + IB_TO_STR(INNODB_VERSION_MAJOR) "." \ + IB_TO_STR(INNODB_VERSION_MINOR) "." \ + IB_TO_STR(INNODB_VERSION_BUGFIX) + +#define REFMAN "http://dev.mysql.com/doc/refman/" \ + IB_TO_STR(MYSQL_VERSION_MAJOR) "." \ + IB_TO_STR(MYSQL_VERSION_MINOR) "/en/" + +#ifdef MYSQL_DYNAMIC_PLUGIN +/* In the dynamic plugin, redefine some externally visible symbols +in order not to conflict with the symbols of a builtin InnoDB. */ + +/* Rename all C++ classes that contain virtual functions, because we +have not figured out how to apply the visibility=hidden attribute to +the virtual method table (vtable) in GCC 3. */ +# define ha_innobase ha_innodb +#endif /* MYSQL_DYNAMIC_PLUGIN */ + +#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(MYSQL_SERVER) && !defined(__WIN__) +# undef __WIN__ +# define __WIN__ + +# include <windows.h> + +# ifdef _NT_ +# define __NT__ +# endif + +#else +/* The defines used with MySQL */ + +/* Include two header files from MySQL to make the Unix flavor used +in compiling more Posix-compatible. These headers also define __WIN__ +if we are compiling on Windows. */ + +#ifndef UNIV_HOTBACKUP +# include <my_global.h> +# include <my_pthread.h> +#endif /* UNIV_HOTBACKUP */ + +/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */ +# include <sys/stat.h> +# if !defined(__WIN__) +# include <sys/mman.h> /* mmap() for os0proc.cc */ +# endif + +/* Include the header file generated by GNU autoconf */ +# ifndef __WIN__ +# ifndef UNIV_HOTBACKUP +# include "config.h" +# endif /* UNIV_HOTBACKUP */ +# endif + +# ifdef HAVE_SCHED_H +# include <sched.h> +# endif + +/* We only try to do explicit inlining of functions with gcc and +Sun Studio */ + +# ifdef HAVE_PREAD +# define HAVE_PWRITE +# endif + +#endif /* #if (defined(WIN32) || ... */ + +#ifndef __WIN__ +#define __STDC_FORMAT_MACROS /* Enable C99 printf format macros */ +#include <inttypes.h> +#endif /* !__WIN__ */ + +/* Following defines are to enable performance schema +instrumentation in each of four InnoDB modules if +HAVE_PSI_INTERFACE is defined. */ +#if defined HAVE_PSI_INTERFACE && !defined UNIV_HOTBACKUP +# define UNIV_PFS_MUTEX +# define UNIV_PFS_RWLOCK +/* For I/O instrumentation, performance schema rely +on a native descriptor to identify the file, this +descriptor could conflict with our OS level descriptor. +Disable IO instrumentation on Windows until this is +resolved */ +# ifndef __WIN__ +# define UNIV_PFS_IO +# endif +# define UNIV_PFS_THREAD + +/* There are mutexes/rwlocks that we want to exclude from +instrumentation even if their corresponding performance schema +define is set. And this PFS_NOT_INSTRUMENTED is used +as the key value to identify those objects that would +be excluded from instrumentation. */ +# define PFS_NOT_INSTRUMENTED ULINT32_UNDEFINED + +# define PFS_IS_INSTRUMENTED(key) ((key) != PFS_NOT_INSTRUMENTED) + +#endif /* HAVE_PSI_INTERFACE */ + +#ifdef __WIN__ +# define YY_NO_UNISTD_H 1 +#endif /* __WIN__ */ + +/* DEBUG VERSION CONTROL + ===================== */ + +/* When this macro is defined then additional test functions will be +compiled. These functions live at the end of each relevant source file +and have "test_" prefix. These functions are not called from anywhere in +the code, they can be called from gdb after +innobase_start_or_create_for_mysql() has executed using the call +command. Not tested on Windows. */ +/* +#define UNIV_COMPILE_TEST_FUNCS +*/ + +#if defined HAVE_VALGRIND +# define UNIV_DEBUG_VALGRIND +#endif /* HAVE_VALGRIND */ +#if 0 +#define UNIV_DEBUG_VALGRIND /* Enable extra + Valgrind instrumentation */ +#define UNIV_DEBUG_PRINT /* Enable the compilation of + some debug print functions */ +#define UNIV_AHI_DEBUG /* Enable adaptive hash index + debugging without UNIV_DEBUG */ +#define UNIV_BUF_DEBUG /* Enable buffer pool + debugging without UNIV_DEBUG */ +#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column + debugging without UNIV_DEBUG */ +#define UNIV_DEBUG /* Enable ut_ad() assertions + and disable UNIV_INLINE */ +#define UNIV_DEBUG_LOCK_VALIDATE /* Enable + ut_ad(lock_rec_validate_page()) + assertions. */ +#define UNIV_DEBUG_FILE_ACCESSES /* Enable freed block access + debugging without UNIV_DEBUG */ +#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */ +#define UNIV_HASH_DEBUG /* debug HASH_ macros */ +#define UNIV_LIST_DEBUG /* debug UT_LIST_ macros */ +#define UNIV_LOG_LSN_DEBUG /* write LSN to the redo log; +this will break redo log file compatibility, but it may be useful when +debugging redo log application problems. */ +#define UNIV_MEM_DEBUG /* detect memory leaks etc */ +#define UNIV_IBUF_DEBUG /* debug the insert buffer */ +#define UNIV_BLOB_DEBUG /* track BLOB ownership; +assumes that no BLOBs survive server restart */ +#define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer; +this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES, +and the insert buffer must be empty when the database is started */ +#define UNIV_PERF_DEBUG /* debug flag that enables + light weight performance + related stuff. */ +#define UNIV_SYNC_DEBUG /* debug mutex and latch +operations (very slow); also UNIV_DEBUG must be defined */ +#define UNIV_SEARCH_DEBUG /* debug B-tree comparisons */ +#define UNIV_SYNC_PERF_STAT /* operation counts for + rw-locks and mutexes */ +#define UNIV_SEARCH_PERF_STAT /* statistics for the + adaptive hash index */ +#define UNIV_SRV_PRINT_LATCH_WAITS /* enable diagnostic output + in sync0sync.cc */ +#define UNIV_BTR_PRINT /* enable functions for + printing B-trees */ +#define UNIV_ZIP_DEBUG /* extensive consistency checks + for compressed pages */ +#define UNIV_ZIP_COPY /* call page_zip_copy_recs() + more often */ +#define UNIV_AIO_DEBUG /* prints info about + submitted and reaped AIO + requests to the log. */ +#define UNIV_STATS_DEBUG /* prints various stats + related debug info from + dict0stats.c */ +#define FTS_INTERNAL_DIAG_PRINT /* FTS internal debugging + info output */ +#endif + +#define UNIV_BTR_DEBUG /* check B-tree links */ +#define UNIV_LIGHT_MEM_DEBUG /* light memory debugging */ + +/* +#define UNIV_SQL_DEBUG +#define UNIV_LOG_DEBUG +*/ + /* the above option prevents forcing of log to disk + at a buffer page write: it should be tested with this + option off; also some ibuf tests are suppressed */ + +/* Linkage specifier for non-static InnoDB symbols (variables and functions) +that are only referenced from within InnoDB, not from MySQL. We disable the +GCC visibility directive on all Sun operating systems because there is no +easy way to get it to work. See http://bugs.mysql.com/bug.php?id=52263. */ +#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(sun) || defined(__INTEL_COMPILER) +# define UNIV_INTERN __attribute__((visibility ("hidden"))) +#else +# define UNIV_INTERN +#endif +#if defined(INNODB_COMPILER_HINTS) \ + && defined __GNUC__ \ + && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 3) +/** Starting with GCC 4.3, the "cold" attribute is used to inform the +compiler that a function is unlikely executed. The function is +optimized for size rather than speed and on many targets it is placed +into special subsection of the text section so all cold functions +appears close together improving code locality of non-cold parts of +program. The paths leading to call of cold functions within code are +marked as unlikely by the branch prediction mechanism. optimize a +rarely invoked function for size instead for speed. */ +# define UNIV_COLD __attribute__((cold)) +#else +# define UNIV_COLD /* empty */ +#endif + +#ifndef UNIV_MUST_NOT_INLINE +/* Definition for inline version */ + +#define UNIV_INLINE static inline + +#else /* !UNIV_MUST_NOT_INLINE */ +/* If we want to compile a noninlined version we use the following macro +definitions: */ + +#define UNIV_NONINL +#define UNIV_INLINE UNIV_INTERN + +#endif /* !UNIV_MUST_NOT_INLINE */ + +#ifdef _WIN32 +#define UNIV_WORD_SIZE 4 +#elif defined(_WIN64) +#define UNIV_WORD_SIZE 8 +#else +/** MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */ +#define UNIV_WORD_SIZE SIZEOF_LONG +#endif + +/** The following alignment is used in memory allocations in memory heap +management to ensure correct alignment for doubles etc. */ +#define UNIV_MEM_ALIGNMENT 8 + +/** The following alignment is used in aligning lints etc. */ +#define UNIV_WORD_ALIGNMENT UNIV_WORD_SIZE + +/* + DATABASE VERSION CONTROL + ======================== +*/ + +/** There are currently two InnoDB file formats which are used to group +features with similar restrictions and dependencies. Using an enum allows +switch statements to give a compiler warning when a new one is introduced. */ +enum innodb_file_formats_enum { + /** Antelope File Format: InnoDB/MySQL up to 5.1. + This format includes REDUNDANT and COMPACT row formats */ + UNIV_FORMAT_A = 0, + + /** Barracuda File Format: Introduced in InnoDB plugin for 5.1: + This format includes COMPRESSED and DYNAMIC row formats. It + includes the ability to create secondary indexes from data that + is not on the clustered index page and the ability to store more + data off the clustered index page. */ + UNIV_FORMAT_B = 1 +}; + +typedef enum innodb_file_formats_enum innodb_file_formats_t; + +/** Minimum supported file format */ +#define UNIV_FORMAT_MIN UNIV_FORMAT_A + +/** Maximum supported file format */ +#define UNIV_FORMAT_MAX UNIV_FORMAT_B + +/** The 2-logarithm of UNIV_PAGE_SIZE: */ +#define UNIV_PAGE_SIZE_SHIFT srv_page_size_shift + +/** The universal page size of the database */ +#define UNIV_PAGE_SIZE ((ulint) srv_page_size) + +/** log2 of smallest compressed page size (1<<10 == 1024 bytes) +Note: This must never change! */ +#define UNIV_ZIP_SIZE_SHIFT_MIN 10 + +/** log2 of largest compressed page size (1<<14 == 16384 bytes). +A compressed page directory entry reserves 14 bits for the start offset +and 2 bits for flags. This limits the uncompressed page size to 16k. +Even though a 16k uncompressed page can theoretically be compressed +into a larger compressed page, it is not a useful feature so we will +limit both with this same constant. */ +#define UNIV_ZIP_SIZE_SHIFT_MAX 14 + +/* Define the Min, Max, Default page sizes. */ +/** Minimum Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_MIN 12 +/** Maximum Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_MAX 14 +/** Default Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_DEF 14 +/** Original 16k InnoDB Page Size Shift, in case the default changes */ +#define UNIV_PAGE_SIZE_SHIFT_ORIG 14 + +/** Minimum page size InnoDB currently supports. */ +#define UNIV_PAGE_SIZE_MIN (1 << UNIV_PAGE_SIZE_SHIFT_MIN) +/** Maximum page size InnoDB currently supports. */ +#define UNIV_PAGE_SIZE_MAX (1 << UNIV_PAGE_SIZE_SHIFT_MAX) +/** Default page size for InnoDB tablespaces. */ +#define UNIV_PAGE_SIZE_DEF (1 << UNIV_PAGE_SIZE_SHIFT_DEF) +/** Original 16k page size for InnoDB tablespaces. */ +#define UNIV_PAGE_SIZE_ORIG (1 << UNIV_PAGE_SIZE_SHIFT_ORIG) + +/** Smallest compressed page size */ +#define UNIV_ZIP_SIZE_MIN (1 << UNIV_ZIP_SIZE_SHIFT_MIN) + +/** Largest compressed page size */ +#define UNIV_ZIP_SIZE_MAX (1 << UNIV_ZIP_SIZE_SHIFT_MAX) + +/** Number of supported page sizes (The convention 'ssize' is used +for 'log2 minus 9' or the number of shifts starting with 512.) +This number varies depending on UNIV_PAGE_SIZE. */ +#define UNIV_PAGE_SSIZE_MAX \ + (UNIV_PAGE_SIZE_SHIFT - UNIV_ZIP_SIZE_SHIFT_MIN + 1) + +/** Maximum number of parallel threads in a parallelized operation */ +#define UNIV_MAX_PARALLELISM 32 + +/** This is the "mbmaxlen" for my_charset_filename (defined in +strings/ctype-utf8.c), which is used to encode File and Database names. */ +#define FILENAME_CHARSET_MAXNAMLEN 5 + +/** The maximum length of an encode table name in bytes. The max +table and database names are NAME_CHAR_LEN (64) characters. After the +encoding, the max length would be NAME_CHAR_LEN (64) * +FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a +terminating '\0'. InnoDB can handle longer names internally */ +#define MAX_TABLE_NAME_LEN 320 + +/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is +the MySQL's NAME_LEN, see check_and_convert_db_name(). */ +#define MAX_DATABASE_NAME_LEN MAX_TABLE_NAME_LEN + +/** MAX_FULL_NAME_LEN defines the full name path including the +database name and table name. In addition, 14 bytes is added for: + 2 for surrounding quotes around table name + 1 for the separating dot (.) + 9 for the #mysql50# prefix */ +#define MAX_FULL_NAME_LEN \ + (MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14) + +/** The maximum length in bytes that a database name can occupy when stored in +UTF8, including the terminating '\0', see dict_fs2utf8(). You must include +mysql_com.h if you are to use this macro. */ +#define MAX_DB_UTF8_LEN (NAME_LEN + 1) + +/** The maximum length in bytes that a table name can occupy when stored in +UTF8, including the terminating '\0', see dict_fs2utf8(). You must include +mysql_com.h if you are to use this macro. */ +#define MAX_TABLE_UTF8_LEN (NAME_LEN + sizeof(srv_mysql50_table_name_prefix)) + +/* + UNIVERSAL TYPE DEFINITIONS + ========================== +*/ + +/* Note that inside MySQL 'byte' is defined as char on Linux! */ +#define byte unsigned char + +/* Another basic type we use is unsigned long integer which should be equal to +the word size of the machine, that is on a 32-bit platform 32 bits, and on a +64-bit platform 64 bits. We also give the printf format for the type as a +macro ULINTPF. */ + + +#ifdef __WIN__ +/* Use the integer types and formatting strings defined in Visual Studio. */ +# define UINT32PF "%I32u" +# define INT64PF "%I64d" +# define UINT64PF "%I64u" +# define UINT64PFx "%016I64x" +# define DBUG_LSN_PF "%llu" +typedef __int64 ib_int64_t; +typedef unsigned __int64 ib_uint64_t; +typedef unsigned __int32 ib_uint32_t; +#else +/* Use the integer types and formatting strings defined in the C99 standard. */ +# define UINT32PF "%" PRIu32 +# define INT64PF "%" PRId64 +# define UINT64PF "%" PRIu64 +# define UINT64PFx "%016" PRIx64 +# define DBUG_LSN_PF UINT64PF +typedef int64_t ib_int64_t; +typedef uint64_t ib_uint64_t; +typedef uint32_t ib_uint32_t; +# endif /* __WIN__ */ + +# define IB_ID_FMT UINT64PF + +#ifdef _WIN64 +typedef unsigned __int64 ulint; +typedef __int64 lint; +# define ULINTPF UINT64PF +#else +typedef unsigned long int ulint; +typedef long int lint; +# define ULINTPF "%lu" +#endif /* _WIN64 */ + +#ifndef UNIV_HOTBACKUP +typedef unsigned long long int ullint; +#endif /* UNIV_HOTBACKUP */ + +#ifndef __WIN__ +#if SIZEOF_LONG != SIZEOF_VOIDP +#error "Error: InnoDB's ulint must be of the same size as void*" +#endif +#endif + +/** The 'undefined' value for a ulint */ +#define ULINT_UNDEFINED ((ulint)(-1)) + +#define ULONG_UNDEFINED ((ulong)(-1)) + +/** The 'undefined' value for a ib_uint64_t */ +#define UINT64_UNDEFINED ((ib_uint64_t)(-1)) + +/** The bitmask of 32-bit unsigned integer */ +#define ULINT32_MASK 0xFFFFFFFF +/** The undefined 32-bit unsigned integer */ +#define ULINT32_UNDEFINED ULINT32_MASK + +/** Maximum value for a ulint */ +#define ULINT_MAX ((ulint)(-2)) + +/** Maximum value for ib_uint64_t */ +#define IB_UINT64_MAX ((ib_uint64_t) (~0ULL)) + +/** The generic InnoDB system object identifier data type */ +typedef ib_uint64_t ib_id_t; +#define IB_ID_MAX IB_UINT64_MAX + +/** The 'undefined' value for a ullint */ +#define ULLINT_UNDEFINED ((ullint)(-1)) + +/** This 'ibool' type is used within Innobase. Remember that different included +headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */ +#define ibool ulint + +#ifndef TRUE + +#define TRUE 1 +#define FALSE 0 + +#endif + +#define UNIV_NOTHROW + +/** The following number as the length of a logical field means that the field +has the SQL NULL as its value. NOTE that because we assume that the length +of a field is a 32-bit integer when we store it, for example, to an undo log +on disk, we must have also this number fit in 32 bits, also in 64-bit +computers! */ + +#define UNIV_SQL_NULL ULINT32_UNDEFINED + +/** Lengths which are not UNIV_SQL_NULL, but bigger than the following +number indicate that a field contains a reference to an externally +stored part of the field in the tablespace. The length field then +contains the sum of the following flag and the locally stored len. */ + +#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_MAX) + +#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER) +#define HAVE_GCC_GT_2 +/* Tell the compiler that variable/function is unused. */ +# define UNIV_UNUSED __attribute__ ((unused)) +#else +# define UNIV_UNUSED +#endif /* CHECK FOR GCC VER_GT_2 */ + +/* Some macros to improve branch prediction and reduce cache misses */ +#if defined(INNODB_COMPILER_HINTS) && defined(HAVE_GCC_GT_2) +/* Tell the compiler that 'expr' probably evaluates to 'constant'. */ +# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant) +/* Tell the compiler that a pointer is likely to be NULL */ +# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ulint) ptr, 0) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read. */ +# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read or written. */ +# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3) + +/* Sun Studio includes sun_prefetch.h as of version 5.9 */ +#elif (defined(__SUNPRO_C) && __SUNPRO_C >= 0x590) \ + || (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x590) + +# include <sun_prefetch.h> + +#if __SUNPRO_C >= 0x550 +# undef UNIV_INTERN +# define UNIV_INTERN __hidden +#endif /* __SUNPRO_C >= 0x550 */ + +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) + +# if defined(INNODB_COMPILER_HINTS) +//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr) +# else +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +# endif /* INNODB_COMPILER_HINTS */ + +#else +/* Dummy versions of the macros */ +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +#endif + +/* Tell the compiler that cond is likely to hold */ +#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE) +/* Tell the compiler that cond is unlikely to hold */ +#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE) + +/* Compile-time constant of the given array's size. */ +#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0])) + +/* The return type from a thread's start function differs between Unix and +Windows, so define a typedef for it and a macro to use at the end of such +functions. */ + +#ifdef __WIN__ +typedef ulint os_thread_ret_t; +#define OS_THREAD_DUMMY_RETURN return(0) +#else +typedef void* os_thread_ret_t; +#define OS_THREAD_DUMMY_RETURN return(NULL) +#endif + +#include <stdio.h> +#include "ut0dbg.h" +#include "ut0ut.h" +#include "db0err.h" +#ifdef UNIV_DEBUG_VALGRIND +# include <valgrind/memcheck.h> +# define UNIV_MEM_VALID(addr, size) VALGRIND_MAKE_MEM_DEFINED(addr, size) +# define UNIV_MEM_INVALID(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size) +# define UNIV_MEM_FREE(addr, size) VALGRIND_MAKE_MEM_NOACCESS(addr, size) +# define UNIV_MEM_ALLOC(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size) +# define UNIV_MEM_DESC(addr, size) VALGRIND_CREATE_BLOCK(addr, size, #addr) +# define UNIV_MEM_UNDESC(b) VALGRIND_DISCARD(b) +# define UNIV_MEM_ASSERT_RW_LOW(addr, size, should_abort) do { \ + const void* _p = (const void*) (ulint) \ + VALGRIND_CHECK_MEM_IS_DEFINED(addr, size); \ + if (UNIV_LIKELY_NULL(_p)) { \ + fprintf(stderr, "%s:%d: %p[%u] undefined at %ld\n", \ + __FILE__, __LINE__, \ + (const void*) (addr), (unsigned) (size), (long) \ + (((const char*) _p) - ((const char*) (addr)))); \ + if (should_abort) { \ + ut_error; \ + } \ + } \ +} while (0) +# define UNIV_MEM_ASSERT_RW(addr, size) \ + UNIV_MEM_ASSERT_RW_LOW(addr, size, false) +# define UNIV_MEM_ASSERT_RW_ABORT(addr, size) \ + UNIV_MEM_ASSERT_RW_LOW(addr, size, true) +# define UNIV_MEM_ASSERT_W(addr, size) do { \ + const void* _p = (const void*) (ulint) \ + VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, size); \ + if (UNIV_LIKELY_NULL(_p)) \ + fprintf(stderr, "%s:%d: %p[%u] unwritable at %ld\n", \ + __FILE__, __LINE__, \ + (const void*) (addr), (unsigned) (size), (long) \ + (((const char*) _p) - ((const char*) (addr)))); \ + } while (0) +# define UNIV_MEM_TRASH(addr, c, size) do { \ + ut_d(memset(addr, c, size)); \ + UNIV_MEM_INVALID(addr, size); \ + } while (0) +#else +# define UNIV_MEM_VALID(addr, size) do {} while(0) +# define UNIV_MEM_INVALID(addr, size) do {} while(0) +# define UNIV_MEM_FREE(addr, size) do {} while(0) +# define UNIV_MEM_ALLOC(addr, size) do {} while(0) +# define UNIV_MEM_DESC(addr, size) do {} while(0) +# define UNIV_MEM_UNDESC(b) do {} while(0) +# define UNIV_MEM_ASSERT_RW_LOW(addr, size, should_abort) do {} while(0) +# define UNIV_MEM_ASSERT_RW(addr, size) do {} while(0) +# define UNIV_MEM_ASSERT_RW_ABORT(addr, size) do {} while(0) +# define UNIV_MEM_ASSERT_W(addr, size) do {} while(0) +# define UNIV_MEM_TRASH(addr, c, size) do {} while(0) +#endif +#define UNIV_MEM_ASSERT_AND_FREE(addr, size) do { \ + UNIV_MEM_ASSERT_W(addr, size); \ + UNIV_MEM_FREE(addr, size); \ +} while (0) +#define UNIV_MEM_ASSERT_AND_ALLOC(addr, size) do { \ + UNIV_MEM_ASSERT_W(addr, size); \ + UNIV_MEM_ALLOC(addr, size); \ +} while (0) + +extern ulong srv_page_size_shift; +extern ulong srv_page_size; + +#endif diff --git a/storage/innobase/include/usr0sess.h b/storage/innobase/include/usr0sess.h new file mode 100644 index 00000000000..b5c80b97b43 --- /dev/null +++ b/storage/innobase/include/usr0sess.h @@ -0,0 +1,77 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/usr0sess.h +Sessions + +Created 6/25/1996 Heikki Tuuri +*******************************************************/ + +#ifndef usr0sess_h +#define usr0sess_h + +#include "univ.i" +#include "ut0byte.h" +#include "trx0types.h" +#include "srv0srv.h" +#include "trx0types.h" +#include "usr0types.h" +#include "que0types.h" +#include "data0data.h" +#include "rem0rec.h" + +/*********************************************************************//** +Opens a session. +@return own: session object */ +UNIV_INTERN +sess_t* +sess_open(void); +/*============*/ +/*********************************************************************//** +Closes a session, freeing the memory occupied by it. */ +UNIV_INTERN +void +sess_close( +/*=======*/ + sess_t* sess); /* in, own: session object */ + +/* The session handle. This data structure is only used by purge and is +not really necessary. We should get rid of it. */ +struct sess_t{ + ulint state; /*!< state of the session */ + trx_t* trx; /*!< transaction object permanently + assigned for the session: the + transaction instance designated by the + trx id changes, but the memory + structure is preserved */ + UT_LIST_BASE_NODE_T(que_t) + graphs; /*!< query graphs belonging to this + session */ +}; + +/* Session states */ +#define SESS_ACTIVE 1 +#define SESS_ERROR 2 /* session contains an error message + which has not yet been communicated + to the client */ +#ifndef UNIV_NONINL +#include "usr0sess.ic" +#endif + +#endif diff --git a/storage/innobase/include/usr0sess.ic b/storage/innobase/include/usr0sess.ic new file mode 100644 index 00000000000..284e59537fe --- /dev/null +++ b/storage/innobase/include/usr0sess.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/usr0sess.ic +Sessions + +Created 6/25/1996 Heikki Tuuri +*******************************************************/ diff --git a/storage/innobase/include/usr0types.h b/storage/innobase/include/usr0types.h new file mode 100644 index 00000000000..6ba937cacc8 --- /dev/null +++ b/storage/innobase/include/usr0types.h @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/usr0types.h +Users and sessions global types + +Created 6/25/1996 Heikki Tuuri +*******************************************************/ + +#ifndef usr0types_h +#define usr0types_h + +struct sess_t; + +#endif diff --git a/storage/innobase/include/ut0bh.h b/storage/innobase/include/ut0bh.h new file mode 100644 index 00000000000..1085736c7ab --- /dev/null +++ b/storage/innobase/include/ut0bh.h @@ -0,0 +1,152 @@ +/***************************************************************************//** + +Copyright (c) 2011, 2013, Oracle Corpn. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0bh.h +Binary min-heap interface. + +Created 2010-05-28 by Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_UT0BH_H +#define INNOBASE_UT0BH_H + +#include "univ.i" + +/** Comparison function for objects in the binary heap. */ +typedef int (*ib_bh_cmp_t)(const void* p1, const void* p2); + +struct ib_bh_t; + +/**********************************************************************//** +Get the number of elements in the binary heap. +@return number of elements */ +UNIV_INLINE +ulint +ib_bh_size( +/*=======*/ + const ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Test if binary heap is empty. +@return TRUE if empty. */ +UNIV_INLINE +ibool +ib_bh_is_empty( +/*===========*/ + const ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Test if binary heap is full. +@return TRUE if full. */ +UNIV_INLINE +ibool +ib_bh_is_full( +/*===========*/ + const ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Get a pointer to the element. +@return pointer to element */ +UNIV_INLINE +void* +ib_bh_get( +/*=======*/ + ib_bh_t* ib_bh, /*!< in: instance */ + ulint i); /*!< in: index */ + +/**********************************************************************//** +Copy an element to the binary heap. +@return pointer to copied element */ +UNIV_INLINE +void* +ib_bh_set( +/*======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + ulint i, /*!< in: index */ + const void* elem); /*!< in: element to add */ + +/**********************************************************************//** +Return the first element from the binary heap. +@return pointer to first element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_first( +/*========*/ + ib_bh_t* ib_bh); /*!< in: instance */ + +/**********************************************************************//** +Return the last element from the binary heap. +@return pointer to last element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_last( +/*========*/ + ib_bh_t* ib_bh); /*!< in/out: instance */ + +/**********************************************************************//** +Create a binary heap. +@return a new binary heap */ +UNIV_INTERN +ib_bh_t* +ib_bh_create( +/*=========*/ + ib_bh_cmp_t compare, /*!< in: comparator */ + ulint sizeof_elem, /*!< in: size of one element */ + ulint max_elems); /*!< in: max elements allowed */ + +/**********************************************************************//** +Free a binary heap. +@return a new binary heap */ +UNIV_INTERN +void +ib_bh_free( +/*=======*/ + ib_bh_t* ib_bh); /*!< in,own: instance */ + +/**********************************************************************//** +Add an element to the binary heap. Note: The element is copied. +@return pointer to added element or NULL if full. */ +UNIV_INTERN +void* +ib_bh_push( +/*=======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + const void* elem); /*!< in: element to add */ + +/**********************************************************************//** +Remove the first element from the binary heap. */ +UNIV_INTERN +void +ib_bh_pop( +/*======*/ + ib_bh_t* ib_bh); /*!< in/out: instance */ + +/** Binary heap data structure */ +struct ib_bh_t { + ulint max_elems; /*!< max elements allowed */ + ulint n_elems; /*!< current size */ + ulint sizeof_elem; /*!< sizeof element */ + ib_bh_cmp_t compare; /*!< comparator */ +}; + +#ifndef UNIV_NONINL +#include "ut0bh.ic" +#endif + +#endif /* INNOBASE_UT0BH_H */ diff --git a/storage/innobase/include/ut0bh.ic b/storage/innobase/include/ut0bh.ic new file mode 100644 index 00000000000..b11de5b8b3e --- /dev/null +++ b/storage/innobase/include/ut0bh.ic @@ -0,0 +1,125 @@ +/***************************************************************************//** + +Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0bh.ic +Binary min-heap implementation. + +Created 2011-01-15 by Sunny Bains +*******************************************************/ + +#include "ut0bh.h" +#include "ut0mem.h" /* For ut_memcpy() */ + +/**********************************************************************//** +Get the number of elements in the binary heap. +@return number of elements */ +UNIV_INLINE +ulint +ib_bh_size( +/*=======*/ + const ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh->n_elems); +} + +/**********************************************************************//** +Test if binary heap is empty. +@return TRUE if empty. */ +UNIV_INLINE +ibool +ib_bh_is_empty( +/*===========*/ + const ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh_size(ib_bh) == 0); +} + +/**********************************************************************//** +Test if binary heap is full. +@return TRUE if full. */ +UNIV_INLINE +ibool +ib_bh_is_full( +/*===========*/ + const ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh_size(ib_bh) >= ib_bh->max_elems); +} + +/**********************************************************************//** +Get a pointer to the element. +@return pointer to element */ +UNIV_INLINE +void* +ib_bh_get( +/*=======*/ + ib_bh_t* ib_bh, /*!< in: instance */ + ulint i) /*!< in: index */ +{ + byte* ptr = (byte*) (ib_bh + 1); + + ut_a(i < ib_bh_size(ib_bh)); + + return(ptr + (ib_bh->sizeof_elem * i)); +} + +/**********************************************************************//** +Copy an element to the binary heap. +@return pointer to copied element */ +UNIV_INLINE +void* +ib_bh_set( +/*======*/ + ib_bh_t* ib_bh, /*!< in/out: instance */ + ulint i, /*!< in: index */ + const void* elem) /*!< in: element to add */ +{ + void* ptr = ib_bh_get(ib_bh, i); + + ut_memcpy(ptr, elem, ib_bh->sizeof_elem); + + return(ptr); +} + +/**********************************************************************//** +Return the first element from the binary heap. +@return pointer to first element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_first( +/*========*/ + ib_bh_t* ib_bh) /*!< in: instance */ +{ + return(ib_bh_is_empty(ib_bh) ? NULL : ib_bh_get(ib_bh, 0)); +} + +/**********************************************************************//** +Return the last element from the binary heap. +@return pointer to last element or NULL if empty. */ +UNIV_INLINE +void* +ib_bh_last( +/*========*/ + ib_bh_t* ib_bh) /*!< in/out: instance */ +{ + return(ib_bh_is_empty(ib_bh) + ? NULL + : ib_bh_get(ib_bh, ib_bh_size(ib_bh) - 1)); +} + diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h new file mode 100644 index 00000000000..5bdd553ca80 --- /dev/null +++ b/storage/innobase/include/ut0byte.h @@ -0,0 +1,119 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0byte.h +Utilities for byte operations + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0byte_h +#define ut0byte_h + + + +#include "univ.i" + +/*******************************************************//** +Creates a 64-bit integer out of two 32-bit integers. +@return created integer */ +UNIV_INLINE +ib_uint64_t +ut_ull_create( +/*==========*/ + ulint high, /*!< in: high-order 32 bits */ + ulint low) /*!< in: low-order 32 bits */ + __attribute__((const)); + +/********************************************************//** +Rounds a 64-bit integer downward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no); /*!< in: align by this number + which must be a power of 2 */ +/********************************************************//** +Rounds ib_uint64_t upward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no); /*!< in: align by this number + which must be a power of 2 */ +/*********************************************************//** +The following function rounds up a pointer to the nearest aligned address. +@return aligned pointer */ +UNIV_INLINE +void* +ut_align( +/*=====*/ + const void* ptr, /*!< in: pointer */ + ulint align_no); /*!< in: align by this number */ +/*********************************************************//** +The following function rounds down a pointer to the nearest +aligned address. +@return aligned pointer */ +UNIV_INLINE +void* +ut_align_down( +/*==========*/ + const void* ptr, /*!< in: pointer */ + ulint align_no) /*!< in: align by this number */ + __attribute__((const)); +/*********************************************************//** +The following function computes the offset of a pointer from the nearest +aligned address. +@return distance from aligned pointer */ +UNIV_INLINE +ulint +ut_align_offset( +/*============*/ + const void* ptr, /*!< in: pointer */ + ulint align_no) /*!< in: align by this number */ + __attribute__((const)); +/*****************************************************************//** +Gets the nth bit of a ulint. +@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n); /*!< in: nth bit requested */ +/*****************************************************************//** +Sets the nth bit of a ulint. +@return the ulint with the bit set as requested */ +UNIV_INLINE +ulint +ut_bit_set_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n, /*!< in: nth bit requested */ + ibool val); /*!< in: value for the bit to set */ + +#ifndef UNIV_NONINL +#include "ut0byte.ic" +#endif + +#endif diff --git a/storage/innobase/include/ut0byte.ic b/storage/innobase/include/ut0byte.ic new file mode 100644 index 00000000000..873d98c727e --- /dev/null +++ b/storage/innobase/include/ut0byte.ic @@ -0,0 +1,173 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0byte.ic +Utilities for byte operations + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +/*******************************************************//** +Creates a 64-bit integer out of two 32-bit integers. +@return created integer */ +UNIV_INLINE +ib_uint64_t +ut_ull_create( +/*==========*/ + ulint high, /*!< in: high-order 32 bits */ + ulint low) /*!< in: low-order 32 bits */ +{ + ut_ad(high <= ULINT32_MASK); + ut_ad(low <= ULINT32_MASK); + return(((ib_uint64_t) high) << 32 | low); +} + +/********************************************************//** +Rounds a 64-bit integer downward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no) /*!< in: align by this number + which must be a power of 2 */ +{ + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return(n & ~((ib_uint64_t) align_no - 1)); +} + +/********************************************************//** +Rounds ib_uint64_t upward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no) /*!< in: align by this number + which must be a power of 2 */ +{ + ib_uint64_t align_1 = (ib_uint64_t) align_no - 1; + + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return((n + align_1) & ~align_1); +} + +/*********************************************************//** +The following function rounds up a pointer to the nearest aligned address. +@return aligned pointer */ +UNIV_INLINE +void* +ut_align( +/*=====*/ + const void* ptr, /*!< in: pointer */ + ulint align_no) /*!< in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return((void*)((((ulint) ptr) + align_no - 1) & ~(align_no - 1))); +} + +/*********************************************************//** +The following function rounds down a pointer to the nearest +aligned address. +@return aligned pointer */ +UNIV_INLINE +void* +ut_align_down( +/*==========*/ + const void* ptr, /*!< in: pointer */ + ulint align_no) /*!< in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return((void*)((((ulint) ptr)) & ~(align_no - 1))); +} + +/*********************************************************//** +The following function computes the offset of a pointer from the nearest +aligned address. +@return distance from aligned pointer */ +UNIV_INLINE +ulint +ut_align_offset( +/*============*/ + const void* ptr, /*!< in: pointer */ + ulint align_no) /*!< in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return(((ulint) ptr) & (align_no - 1)); +} + +/*****************************************************************//** +Gets the nth bit of a ulint. +@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n) /*!< in: nth bit requested */ +{ + ut_ad(n < 8 * sizeof(ulint)); +#if TRUE != 1 +# error "TRUE != 1" +#endif + return(1 & (a >> n)); +} + +/*****************************************************************//** +Sets the nth bit of a ulint. +@return the ulint with the bit set as requested */ +UNIV_INLINE +ulint +ut_bit_set_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n, /*!< in: nth bit requested */ + ibool val) /*!< in: value for the bit to set */ +{ + ut_ad(n < 8 * sizeof(ulint)); +#if TRUE != 1 +# error "TRUE != 1" +#endif + if (val) { + return(((ulint) 1 << n) | a); + } else { + return(~((ulint) 1 << n) & a); + } +} diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h new file mode 100644 index 00000000000..fe0f36dfff2 --- /dev/null +++ b/storage/innobase/include/ut0counter.h @@ -0,0 +1,203 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ut0counter.h + +Counter utility class + +Created 2012/04/12 by Sunny Bains +*******************************************************/ + +#ifndef UT0COUNTER_H +#define UT0COUNTER_H + +#include "univ.i" +#include <string.h> +#include "os0thread.h" + +/** CPU cache line size */ +#define CACHE_LINE_SIZE 64 + +/** Default number of slots to use in ib_counter_t */ +#define IB_N_SLOTS 64 + +/** Get the offset into the counter array. */ +template <typename Type, int N> +struct generic_indexer_t { + /** Default constructor/destructor should be OK. */ + + /** @return offset within m_counter */ + size_t offset(size_t index) const UNIV_NOTHROW { + return(((index % N) + 1) * (CACHE_LINE_SIZE / sizeof(Type))); + } +}; + +#ifdef HAVE_SCHED_GETCPU +#include <utmpx.h> +/** Use the cpu id to index into the counter array. If it fails then +use the thread id. */ +template <typename Type, int N> +struct get_sched_indexer_t : public generic_indexer_t<Type, N> { + /** Default constructor/destructor should be OK. */ + + /* @return result from sched_getcpu(), the thread id if it fails. */ + size_t get_rnd_index() const UNIV_NOTHROW { + + size_t cpu = sched_getcpu(); + if (cpu == -1) { + cpu = (lint) os_thread_get_curr_id(); + } + + return(cpu); + } +}; +#endif /* HAVE_SCHED_GETCPU */ + +/** Use the thread id to index into the counter array. */ +template <typename Type, int N> +struct thread_id_indexer_t : public generic_indexer_t<Type, N> { + /** Default constructor/destructor should are OK. */ + + /* @return a random number, currently we use the thread id. Where + thread id is represented as a pointer, it may not work as + effectively. */ + size_t get_rnd_index() const UNIV_NOTHROW { + return((lint) os_thread_get_curr_id()); + } +}; + +/** For counters wher N=1 */ +template <typename Type, int N=1> +struct single_indexer_t { + /** Default constructor/destructor should are OK. */ + + /** @return offset within m_counter */ + size_t offset(size_t index) const UNIV_NOTHROW { + ut_ad(N == 1); + return((CACHE_LINE_SIZE / sizeof(Type))); + } + + /* @return 1 */ + size_t get_rnd_index() const UNIV_NOTHROW { + ut_ad(N == 1); + return(1); + } +}; + +/** Class for using fuzzy counters. The counter is not protected by any +mutex and the results are not guaranteed to be 100% accurate but close +enough. Creates an array of counters and separates each element by the +CACHE_LINE_SIZE bytes */ +template < + typename Type, + int N = IB_N_SLOTS, + template<typename, int> class Indexer = thread_id_indexer_t> +class ib_counter_t { +public: + ib_counter_t() { memset(m_counter, 0x0, sizeof(m_counter)); } + + ~ib_counter_t() + { + ut_ad(validate()); + } + + bool validate() UNIV_NOTHROW { +#ifdef UNIV_DEBUG + size_t n = (CACHE_LINE_SIZE / sizeof(Type)); + + /* Check that we aren't writing outside our defined bounds. */ + for (size_t i = 0; i < UT_ARR_SIZE(m_counter); i += n) { + for (size_t j = 1; j < n - 1; ++j) { + ut_ad(m_counter[i + j] == 0); + } + } +#endif /* UNIV_DEBUG */ + return(true); + } + + /** If you can't use a good index id. Increment by 1. */ + void inc() UNIV_NOTHROW { add(1); } + + /** If you can't use a good index id. + * @param n - is the amount to increment */ + void add(Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(m_policy.get_rnd_index()); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] += n; + } + + /** Use this if you can use a unique indentifier, saves a + call to get_rnd_index(). + @param i - index into a slot + @param n - amount to increment */ + void add(size_t index, Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(index); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] += n; + } + + /** If you can't use a good index id. Decrement by 1. */ + void dec() UNIV_NOTHROW { sub(1); } + + /** If you can't use a good index id. + * @param - n is the amount to decrement */ + void sub(Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(m_policy.get_rnd_index()); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] -= n; + } + + /** Use this if you can use a unique indentifier, saves a + call to get_rnd_index(). + @param i - index into a slot + @param n - amount to decrement */ + void sub(size_t index, Type n) UNIV_NOTHROW { + size_t i = m_policy.offset(index); + + ut_ad(i < UT_ARR_SIZE(m_counter)); + + m_counter[i] -= n; + } + + /* @return total value - not 100% accurate, since it is not atomic. */ + operator Type() const UNIV_NOTHROW { + Type total = 0; + + for (size_t i = 0; i < N; ++i) { + total += m_counter[m_policy.offset(i)]; + } + + return(total); + } + +private: + /** Indexer into the array */ + Indexer<Type, N>m_policy; + + /** Slot 0 is unused. */ + Type m_counter[(N + 1) * (CACHE_LINE_SIZE / sizeof(Type))]; +}; + +#endif /* UT0COUNTER_H */ diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h new file mode 100644 index 00000000000..86217692764 --- /dev/null +++ b/storage/innobase/include/ut0crc32.h @@ -0,0 +1,51 @@ +/***************************************************************************** + +Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ut0crc32.h +CRC32 implementation + +Created Aug 10, 2011 Vasil Dimov +*******************************************************/ + +#ifndef ut0crc32_h +#define ut0crc32_h + +#include "univ.i" + +/********************************************************************//** +Initializes the data structures used by ut_crc32(). Does not do any +allocations, would not hurt if called twice, but would be pointless. */ +UNIV_INTERN +void +ut_crc32_init(); +/*===========*/ + +/********************************************************************//** +Calculates CRC32. +@param ptr - data over which to calculate CRC32. +@param len - data length in bytes. +@return CRC32 (CRC-32C, using the GF(2) primitive polynomial 0x11EDC6F41, +or 0x1EDC6F41 without the high-order bit) */ +typedef ib_uint32_t (*ib_ut_crc32_t)(const byte* ptr, ulint len); + +extern ib_ut_crc32_t ut_crc32; + +extern bool ut_crc32_sse2_enabled; + +#endif /* ut0crc32_h */ diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h new file mode 100644 index 00000000000..6a4afe99597 --- /dev/null +++ b/storage/innobase/include/ut0dbg.h @@ -0,0 +1,132 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*****************************************************************//** +@file include/ut0dbg.h +Debug utilities for Innobase + +Created 1/30/1994 Heikki Tuuri +**********************************************************************/ + +#ifndef ut0dbg_h +#define ut0dbg_h + +#ifdef UNIV_INNOCHECKSUM +#define ut_a assert +#define ut_ad assert +#define ut_error assert(0) +#else /* !UNIV_INNOCHECKSUM */ + +#include "univ.i" +#include <stdlib.h> +#include "os0thread.h" + +#if defined(__GNUC__) && (__GNUC__ > 2) +/** Test if an assertion fails. +@param EXPR assertion expression +@return nonzero if EXPR holds, zero if not */ +# define UT_DBG_FAIL(EXPR) UNIV_UNLIKELY(!((ulint)(EXPR))) +#else +/** This is used to eliminate compiler warnings */ +extern ulint ut_dbg_zero; +/** Test if an assertion fails. +@param EXPR assertion expression +@return nonzero if EXPR holds, zero if not */ +# define UT_DBG_FAIL(EXPR) !((ulint)(EXPR) + ut_dbg_zero) +#endif + +/*************************************************************//** +Report a failed assertion. */ +UNIV_INTERN +void +ut_dbg_assertion_failed( +/*====================*/ + const char* expr, /*!< in: the failed assertion */ + const char* file, /*!< in: source file containing the assertion */ + ulint line) /*!< in: line number of the assertion */ + UNIV_COLD __attribute__((nonnull(2))); + +/** Abort the execution. */ +# define UT_DBG_PANIC abort() + +/** Abort execution if EXPR does not evaluate to nonzero. +@param EXPR assertion expression that should hold */ +#define ut_a(EXPR) do { \ + if (UT_DBG_FAIL(EXPR)) { \ + ut_dbg_assertion_failed(#EXPR, \ + __FILE__, (ulint) __LINE__); \ + UT_DBG_PANIC; \ + } \ +} while (0) + +/** Abort execution. */ +#define ut_error do { \ + ut_dbg_assertion_failed(0, __FILE__, (ulint) __LINE__); \ + UT_DBG_PANIC; \ +} while (0) + +#ifdef UNIV_DEBUG +/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_ad(EXPR) ut_a(EXPR) +/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_d(EXPR) do {EXPR;} while (0) +#else +/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_ad(EXPR) +/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_d(EXPR) +#endif + +/** Silence warnings about an unused variable by doing a null assignment. +@param A the unused variable */ +#define UT_NOT_USED(A) A = A + +#ifdef UNIV_COMPILE_TEST_FUNCS + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> + +/** structure used for recording usage statistics */ +struct speedo_t { + struct rusage ru; /*!< getrusage() result */ + struct timeval tv; /*!< gettimeofday() result */ +}; + +/*******************************************************************//** +Resets a speedo (records the current time in it). */ +UNIV_INTERN +void +speedo_reset( +/*=========*/ + speedo_t* speedo); /*!< out: speedo */ + +/*******************************************************************//** +Shows the time elapsed and usage statistics since the last reset of a +speedo. */ +UNIV_INTERN +void +speedo_show( +/*========*/ + const speedo_t* speedo); /*!< in: speedo */ + +#endif /* UNIV_COMPILE_TEST_FUNCS */ + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h new file mode 100644 index 00000000000..29fc8669ce4 --- /dev/null +++ b/storage/innobase/include/ut0list.h @@ -0,0 +1,180 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0list.h +A double-linked list + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/*******************************************************************//** +A double-linked list. This differs from the one in ut0lst.h in that in this +one, each list node contains a pointer to the data, whereas the one in +ut0lst.h uses a strategy where the list pointers are embedded in the data +items themselves. + +Use this one when you need to store arbitrary data in the list where you +can't embed the list pointers in the data, if a data item needs to be +stored in multiple lists, etc. + +Note about the memory management: ib_list_t is a fixed-size struct whose +allocation/deallocation is done through ib_list_create/ib_list_free, but the +memory for the list nodes is allocated through a user-given memory heap, +which can either be the same for all nodes or vary per node. Most users will +probably want to create a memory heap to store the item-specific data, and +pass in this same heap to the list node creation functions, thus +automatically freeing the list node when the item's heap is freed. + +************************************************************************/ + +#ifndef IB_LIST_H +#define IB_LIST_H + +#include "mem0mem.h" + +struct ib_list_t; +struct ib_list_node_t; + +/****************************************************************//** +Create a new list using mem_alloc. Lists created with this function must be +freed with ib_list_free. +@return list */ +UNIV_INTERN +ib_list_t* +ib_list_create(void); +/*=================*/ + + +/****************************************************************//** +Create a new list using the given heap. ib_list_free MUST NOT BE CALLED for +lists created with this function. +@return list */ +UNIV_INTERN +ib_list_t* +ib_list_create_heap( +/*================*/ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Free a list. */ +UNIV_INTERN +void +ib_list_free( +/*=========*/ + ib_list_t* list); /*!< in: list */ + +/****************************************************************//** +Add the data to the start of the list. +@return new list node */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_first( +/*==============*/ + ib_list_t* list, /*!< in: list */ + void* data, /*!< in: data */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Add the data to the end of the list. +@return new list node */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_last( +/*=============*/ + ib_list_t* list, /*!< in: list */ + void* data, /*!< in: data */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Add the data after the indicated node. +@return new list node */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_after( +/*==============*/ + ib_list_t* list, /*!< in: list */ + ib_list_node_t* prev_node, /*!< in: node preceding new node (can + be NULL) */ + void* data, /*!< in: data */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Remove the node from the list. */ +UNIV_INTERN +void +ib_list_remove( +/*===========*/ + ib_list_t* list, /*!< in: list */ + ib_list_node_t* node); /*!< in: node to remove */ + +/****************************************************************//** +Get the first node in the list. +@return first node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + ib_list_t* list); /*!< in: list */ + +/****************************************************************//** +Get the last node in the list. +@return last node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + ib_list_t* list); /*!< in: list */ + +/******************************************************************** +Check if list is empty. */ +UNIV_INLINE +ibool +ib_list_is_empty( +/*=============*/ + /* out: TRUE if empty else */ + const ib_list_t* list); /* in: list */ + +/* List. */ +struct ib_list_t { + ib_list_node_t* first; /*!< first node */ + ib_list_node_t* last; /*!< last node */ + ibool is_heap_list; /*!< TRUE if this list was + allocated through a heap */ +}; + +/* A list node. */ +struct ib_list_node_t { + ib_list_node_t* prev; /*!< previous node */ + ib_list_node_t* next; /*!< next node */ + void* data; /*!< user data */ +}; + +/* Quite often, the only additional piece of data you need is the per-item +memory heap, so we have this generic struct available to use in those +cases. */ +struct ib_list_helper_t { + mem_heap_t* heap; /*!< memory heap */ + void* data; /*!< user data */ +}; + +#ifndef UNIV_NONINL +#include "ut0list.ic" +#endif + +#endif diff --git a/storage/innobase/include/ut0list.ic b/storage/innobase/include/ut0list.ic new file mode 100644 index 00000000000..d9dcb2eac99 --- /dev/null +++ b/storage/innobase/include/ut0list.ic @@ -0,0 +1,60 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0list.ic +A double-linked list + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/****************************************************************//** +Get the first node in the list. +@return first node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + ib_list_t* list) /*!< in: list */ +{ + return(list->first); +} + +/****************************************************************//** +Get the last node in the list. +@return last node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + ib_list_t* list) /*!< in: list */ +{ + return(list->last); +} + +/******************************************************************** +Check if list is empty. */ +UNIV_INLINE +ibool +ib_list_is_empty( +/*=============*/ + /* out: TRUE if empty else FALSE */ + const ib_list_t* list) /* in: list */ +{ + return(!(list->first || list->last)); +} diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h new file mode 100644 index 00000000000..b53e7ade4c1 --- /dev/null +++ b/storage/innobase/include/ut0lst.h @@ -0,0 +1,408 @@ +/***************************************************************************** + +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0lst.h +List utilities + +Created 9/10/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0lst_h +#define ut0lst_h + +#include "univ.i" + +/*******************************************************************//** +Return offset of F in POD T. +@param T - POD pointer +@param F - Field in T */ +#define IB_OFFSETOF(T, F) \ + (reinterpret_cast<byte*>(&(T)->F) - reinterpret_cast<byte*>(T)) + +/* This module implements the two-way linear list which should be used +if a list is used in the database. Note that a single struct may belong +to two or more lists, provided that the list are given different names. +An example of the usage of the lists can be found in fil0fil.cc. */ + +/*******************************************************************//** +This macro expands to the unnamed type definition of a struct which acts +as the two-way list base node. The base node contains pointers +to both ends of the list and a count of nodes in the list (excluding +the base node from the count). +@param TYPE the name of the list node data type */ +template <typename TYPE> +struct ut_list_base { + typedef TYPE elem_type; + + ulint count; /*!< count of nodes in list */ + TYPE* start; /*!< pointer to list start, NULL if empty */ + TYPE* end; /*!< pointer to list end, NULL if empty */ +}; + +#define UT_LIST_BASE_NODE_T(TYPE) ut_list_base<TYPE> + +/*******************************************************************//** +This macro expands to the unnamed type definition of a struct which +should be embedded in the nodes of the list, the node type must be a struct. +This struct contains the pointers to next and previous nodes in the list. +The name of the field in the node struct should be the name given +to the list. +@param TYPE the list node type name */ +/* Example: +struct LRU_node_t { + UT_LIST_NODE_T(LRU_node_t) LRU_list; + ... +} +The example implements an LRU list of name LRU_list. Its nodes are of type +LRU_node_t. */ + +template <typename TYPE> +struct ut_list_node { + TYPE* prev; /*!< pointer to the previous node, + NULL if start of list */ + TYPE* next; /*!< pointer to next node, NULL if end of list */ +}; + +#define UT_LIST_NODE_T(TYPE) ut_list_node<TYPE> + +/*******************************************************************//** +Get the list node at offset. +@param elem - list element +@param offset - offset within element. +@return reference to list node. */ +template <typename Type> +ut_list_node<Type>& +ut_elem_get_node(Type& elem, size_t offset) +{ + ut_a(offset < sizeof(elem)); + + return(*reinterpret_cast<ut_list_node<Type>*>( + reinterpret_cast<byte*>(&elem) + offset)); +} + +/*******************************************************************//** +Initializes the base node of a two-way list. +@param BASE the list base node +*/ +#define UT_LIST_INIT(BASE)\ +{\ + (BASE).count = 0;\ + (BASE).start = NULL;\ + (BASE).end = NULL;\ +}\ + +/*******************************************************************//** +Adds the node as the first element in a two-way linked list. +@param list the base node (not a pointer to it) +@param elem the element to add +@param offset offset of list node in elem. */ +template <typename List, typename Type> +void +ut_list_prepend( + List& list, + Type& elem, + size_t offset) +{ + ut_list_node<Type>& elem_node = ut_elem_get_node(elem, offset); + + elem_node.prev = 0; + elem_node.next = list.start; + + if (list.start != 0) { + ut_list_node<Type>& base_node = + ut_elem_get_node(*list.start, offset); + + ut_ad(list.start != &elem); + + base_node.prev = &elem; + } + + list.start = &elem; + + if (list.end == 0) { + list.end = &elem; + } + + ++list.count; +} + +/*******************************************************************//** +Adds the node as the first element in a two-way linked list. +@param NAME list name +@param LIST the base node (not a pointer to it) +@param ELEM the element to add */ +#define UT_LIST_ADD_FIRST(NAME, LIST, ELEM) \ + ut_list_prepend(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME)) + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. +@param list list +@param elem the element to add +@param offset offset of list node in elem */ +template <typename List, typename Type> +void +ut_list_append( + List& list, + Type& elem, + size_t offset) +{ + ut_list_node<Type>& elem_node = ut_elem_get_node(elem, offset); + + elem_node.next = 0; + elem_node.prev = list.end; + + if (list.end != 0) { + ut_list_node<Type>& base_node = + ut_elem_get_node(*list.end, offset); + + ut_ad(list.end != &elem); + + base_node.next = &elem; + } + + list.end = &elem; + + if (list.start == 0) { + list.start = &elem; + } + + ++list.count; +} + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. +@param NAME list name +@param LIST list +@param ELEM the element to add */ +#define UT_LIST_ADD_LAST(NAME, LIST, ELEM)\ + ut_list_append(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME)) + +/*******************************************************************//** +Inserts a ELEM2 after ELEM1 in a list. +@param list the base node +@param elem1 node after which ELEM2 is inserted +@param elem2 node being inserted after NODE1 +@param offset offset of list node in elem1 and elem2 */ +template <typename List, typename Type> +void +ut_list_insert( + List& list, + Type& elem1, + Type& elem2, + size_t offset) +{ + ut_ad(&elem1 != &elem2); + + ut_list_node<Type>& elem1_node = ut_elem_get_node(elem1, offset); + ut_list_node<Type>& elem2_node = ut_elem_get_node(elem2, offset); + + elem2_node.prev = &elem1; + elem2_node.next = elem1_node.next; + + if (elem1_node.next != NULL) { + ut_list_node<Type>& next_node = + ut_elem_get_node(*elem1_node.next, offset); + + next_node.prev = &elem2; + } + + elem1_node.next = &elem2; + + if (list.end == &elem1) { + list.end = &elem2; + } + + ++list.count; +} + +/*******************************************************************//** +Inserts a ELEM2 after ELEM1 in a list. +@param NAME list name +@param LIST the base node +@param ELEM1 node after which ELEM2 is inserted +@param ELEM2 node being inserted after ELEM1 */ +#define UT_LIST_INSERT_AFTER(NAME, LIST, ELEM1, ELEM2)\ + ut_list_insert(LIST, *ELEM1, *ELEM2, IB_OFFSETOF(ELEM1, NAME)) + +#ifdef UNIV_LIST_DEBUG +/** Invalidate the pointers in a list node. +@param NAME list name +@param N pointer to the node that was removed */ +# define UT_LIST_REMOVE_CLEAR(N) \ + (N).next = (Type*) -1; \ + (N).prev = (N).next +#else +/** Invalidate the pointers in a list node. +@param NAME list name +@param N pointer to the node that was removed */ +# define UT_LIST_REMOVE_CLEAR(N) +#endif /* UNIV_LIST_DEBUG */ + +/*******************************************************************//** +Removes a node from a two-way linked list. +@param list the base node (not a pointer to it) +@param elem node to be removed from the list +@param offset offset of list node within elem */ +template <typename List, typename Type> +void +ut_list_remove( + List& list, + Type& elem, + size_t offset) +{ + ut_list_node<Type>& elem_node = ut_elem_get_node(elem, offset); + + ut_a(list.count > 0); + + if (elem_node.next != NULL) { + ut_list_node<Type>& next_node = + ut_elem_get_node(*elem_node.next, offset); + + next_node.prev = elem_node.prev; + } else { + list.end = elem_node.prev; + } + + if (elem_node.prev != NULL) { + ut_list_node<Type>& prev_node = + ut_elem_get_node(*elem_node.prev, offset); + + prev_node.next = elem_node.next; + } else { + list.start = elem_node.next; + } + + UT_LIST_REMOVE_CLEAR(elem_node); + + --list.count; +} + +/*******************************************************************//** +Removes a node from a two-way linked list. + aram NAME list name +@param LIST the base node (not a pointer to it) +@param ELEM node to be removed from the list */ +#define UT_LIST_REMOVE(NAME, LIST, ELEM) \ + ut_list_remove(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME)) + +/********************************************************************//** +Gets the next node in a two-way list. +@param NAME list name +@param N pointer to a node +@return the successor of N in NAME, or NULL */ +#define UT_LIST_GET_NEXT(NAME, N)\ + (((N)->NAME).next) + +/********************************************************************//** +Gets the previous node in a two-way list. +@param NAME list name +@param N pointer to a node +@return the predecessor of N in NAME, or NULL */ +#define UT_LIST_GET_PREV(NAME, N)\ + (((N)->NAME).prev) + +/********************************************************************//** +Alternative macro to get the number of nodes in a two-way list, i.e., +its length. +@param BASE the base node (not a pointer to it). +@return the number of nodes in the list */ +#define UT_LIST_GET_LEN(BASE)\ + (BASE).count + +/********************************************************************//** +Gets the first node in a two-way list. +@param BASE the base node (not a pointer to it) +@return first node, or NULL if the list is empty */ +#define UT_LIST_GET_FIRST(BASE)\ + (BASE).start + +/********************************************************************//** +Gets the last node in a two-way list. +@param BASE the base node (not a pointer to it) +@return last node, or NULL if the list is empty */ +#define UT_LIST_GET_LAST(BASE)\ + (BASE).end + +struct NullValidate { void operator()(const void* elem) { } }; + +/********************************************************************//** +Iterate over all the elements and call the functor for each element. +@param list base node (not a pointer to it) +@param functor Functor that is called for each element in the list +@parm node pointer to member node within list element */ +template <typename List, class Functor> +void +ut_list_map( + List& list, + ut_list_node<typename List::elem_type> + List::elem_type::*node, + Functor functor) +{ + ulint count = 0; + + for (typename List::elem_type* elem = list.start; + elem != 0; + elem = (elem->*node).next, ++count) { + + functor(elem); + } + + ut_a(count == list.count); +} + +/********************************************************************//** +Checks the consistency of a two-way list. +@param list base node (not a pointer to it) +@param functor Functor that is called for each element in the list +@parm node pointer to member node within list element */ +template <typename List, class Functor> +void +ut_list_validate( + List& list, + ut_list_node<typename List::elem_type> + List::elem_type::*node, + Functor functor = NullValidate()) +{ + ut_list_map(list, node, functor); + + ulint count = 0; + + for (typename List::elem_type* elem = list.end; + elem != 0; + elem = (elem->*node).prev, ++count) { + + functor(elem); + } + + ut_a(count == list.count); +} + +/********************************************************************//** +Checks the consistency of a two-way list. +@param NAME the name of the list +@param TYPE node type +@param LIST base node (not a pointer to it) +@param FUNCTOR called for each list element */ +#define UT_LIST_VALIDATE(NAME, TYPE, LIST, FUNCTOR) \ + ut_list_validate(LIST, &TYPE::NAME, FUNCTOR) + +#define UT_LIST_CHECK(NAME, TYPE, LIST) \ + ut_list_validate(LIST, &TYPE::NAME, NullValidate()) + +#endif /* ut0lst.h */ diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h new file mode 100644 index 00000000000..af7eb4e9b1d --- /dev/null +++ b/storage/innobase/include/ut0mem.h @@ -0,0 +1,261 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0mem.h +Memory primitives + +Created 5/30/1994 Heikki Tuuri +************************************************************************/ + +#ifndef ut0mem_h +#define ut0mem_h + +#include "univ.i" +#include <string.h> +#ifndef UNIV_HOTBACKUP +# include "os0sync.h" + +/** The total amount of memory currently allocated from the operating +system with os_mem_alloc_large() or malloc(). Does not count malloc() +if srv_use_sys_malloc is set. Protected by ut_list_mutex. */ +extern ulint ut_total_allocated_memory; + +/** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */ +extern os_fast_mutex_t ut_list_mutex; +#endif /* !UNIV_HOTBACKUP */ + +/** Wrapper for memcpy(3). Copy memory area when the source and +target are not overlapping. +* @param dest in: copy to +* @param sour in: copy from +* @param n in: number of bytes to copy +* @return dest */ +UNIV_INLINE +void* +ut_memcpy(void* dest, const void* sour, ulint n); + +/** Wrapper for memmove(3). Copy memory area when the source and +target are overlapping. +* @param dest in: copy to +* @param sour in: copy from +* @param n in: number of bytes to copy +* @return dest */ +UNIV_INLINE +void* +ut_memmove(void* dest, const void* sour, ulint n); + +/** Wrapper for memcmp(3). Compare memory areas. +* @param str1 in: first memory block to compare +* @param str2 in: second memory block to compare +* @param n in: number of bytes to compare +* @return negative, 0, or positive if str1 is smaller, equal, + or greater than str2, respectively. */ +UNIV_INLINE +int +ut_memcmp(const void* str1, const void* str2, ulint n); + +/**********************************************************************//** +Initializes the mem block list at database startup. */ +UNIV_INTERN +void +ut_mem_init(void); +/*=============*/ + +/**********************************************************************//** +Allocates memory. +@return own: allocated memory */ +UNIV_INTERN +void* +ut_malloc_low( +/*==========*/ + ulint n, /*!< in: number of bytes to allocate */ + ibool assert_on_error) /*!< in: if TRUE, we crash mysqld if + the memory cannot be allocated */ + __attribute__((malloc)); +/**********************************************************************//** +Allocates memory. */ +#define ut_malloc(n) ut_malloc_low(n, TRUE) +/**********************************************************************//** +Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is +a nop. */ +UNIV_INTERN +void +ut_free( +/*====*/ + void* ptr); /*!< in, own: memory block, can be NULL */ +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Implements realloc. This is needed by /pars/lexyy.cc. Otherwise, you should not +use this function because the allocation functions in mem0mem.h are the +recommended ones in InnoDB. + +man realloc in Linux, 2004: + + realloc() changes the size of the memory block pointed to + by ptr to size bytes. The contents will be unchanged to + the minimum of the old and new sizes; newly allocated mem + ory will be uninitialized. If ptr is NULL, the call is + equivalent to malloc(size); if size is equal to zero, the + call is equivalent to free(ptr). Unless ptr is NULL, it + must have been returned by an earlier call to malloc(), + calloc() or realloc(). + +RETURN VALUE + realloc() returns a pointer to the newly allocated memory, + which is suitably aligned for any kind of variable and may + be different from ptr, or NULL if the request fails. If + size was equal to 0, either NULL or a pointer suitable to + be passed to free() is returned. If realloc() fails the + original block is left untouched - it is not freed or + moved. +@return own: pointer to new mem block or NULL */ +UNIV_INTERN +void* +ut_realloc( +/*=======*/ + void* ptr, /*!< in: pointer to old block or NULL */ + ulint size); /*!< in: desired size */ +/**********************************************************************//** +Frees in shutdown all allocated memory not freed yet. */ +UNIV_INTERN +void +ut_free_all_mem(void); +/*=================*/ +#endif /* !UNIV_HOTBACKUP */ + +/** Wrapper for strcpy(3). Copy a NUL-terminated string. +* @param dest in: copy to +* @param sour in: copy from +* @return dest */ +UNIV_INLINE +char* +ut_strcpy(char* dest, const char* sour); + +/** Wrapper for strlen(3). Determine the length of a NUL-terminated string. +* @param str in: string +* @return length of the string in bytes, excluding the terminating NUL */ +UNIV_INLINE +ulint +ut_strlen(const char* str); + +/** Wrapper for strcmp(3). Compare NUL-terminated strings. +* @param str1 in: first string to compare +* @param str2 in: second string to compare +* @return negative, 0, or positive if str1 is smaller, equal, + or greater than str2, respectively. */ +UNIV_INLINE +int +ut_strcmp(const char* str1, const char* str2); + +/**********************************************************************//** +Copies up to size - 1 characters from the NUL-terminated string src to +dst, NUL-terminating the result. Returns strlen(src), so truncation +occurred if the return value >= size. +@return strlen(src) */ +UNIV_INTERN +ulint +ut_strlcpy( +/*=======*/ + char* dst, /*!< in: destination buffer */ + const char* src, /*!< in: source buffer */ + ulint size); /*!< in: size of destination buffer */ + +/**********************************************************************//** +Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last +(size - 1) bytes of src, not the first. +@return strlen(src) */ +UNIV_INTERN +ulint +ut_strlcpy_rev( +/*===========*/ + char* dst, /*!< in: destination buffer */ + const char* src, /*!< in: source buffer */ + ulint size); /*!< in: size of destination buffer */ + +/**********************************************************************//** +Return the number of times s2 occurs in s1. Overlapping instances of s2 +are only counted once. +@return the number of times s2 occurs in s1 */ +UNIV_INTERN +ulint +ut_strcount( +/*========*/ + const char* s1, /*!< in: string to search in */ + const char* s2); /*!< in: string to search for */ + +/**********************************************************************//** +Replace every occurrence of s1 in str with s2. Overlapping instances of s1 +are only replaced once. +@return own: modified string, must be freed with mem_free() */ +UNIV_INTERN +char* +ut_strreplace( +/*==========*/ + const char* str, /*!< in: string to operate on */ + const char* s1, /*!< in: string to replace */ + const char* s2); /*!< in: string to replace s1 with */ + +/******************************************************************** +Concatenate 3 strings.*/ + +char* +ut_str3cat( +/*=======*/ + /* out, own: concatenated string, must be + freed with mem_free() */ + const char* s1, /* in: string 1 */ + const char* s2, /* in: string 2 */ + const char* s3); /* in: string 3 */ + +/**********************************************************************//** +Converts a raw binary data to a NUL-terminated hex string. The output is +truncated if there is not enough space in "hex", make sure "hex_size" is at +least (2 * raw_size + 1) if you do not want this to happen. Returns the +actual number of characters written to "hex" (including the NUL). +@return number of chars written */ +UNIV_INLINE +ulint +ut_raw_to_hex( +/*==========*/ + const void* raw, /*!< in: raw data */ + ulint raw_size, /*!< in: "raw" length in bytes */ + char* hex, /*!< out: hex string */ + ulint hex_size); /*!< in: "hex" size in bytes */ + +/*******************************************************************//** +Adds single quotes to the start and end of string and escapes any quotes +by doubling them. Returns the number of bytes that were written to "buf" +(including the terminating NUL). If buf_size is too small then the +trailing bytes from "str" are discarded. +@return number of bytes that were written */ +UNIV_INLINE +ulint +ut_str_sql_format( +/*==============*/ + const char* str, /*!< in: string */ + ulint str_len, /*!< in: string length in bytes */ + char* buf, /*!< out: output buffer */ + ulint buf_size); /*!< in: output buffer size + in bytes */ + +#ifndef UNIV_NONINL +#include "ut0mem.ic" +#endif + +#endif diff --git a/storage/innobase/include/ut0mem.ic b/storage/innobase/include/ut0mem.ic new file mode 100644 index 00000000000..5c9071d52cc --- /dev/null +++ b/storage/innobase/include/ut0mem.ic @@ -0,0 +1,317 @@ +/***************************************************************************** + +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0mem.ic +Memory primitives + +Created 5/30/1994 Heikki Tuuri +************************************************************************/ + +#include "ut0byte.h" +#include "mach0data.h" + +/** Wrapper for memcpy(3). Copy memory area when the source and +target are not overlapping. +* @param dest in: copy to +* @param sour in: copy from +* @param n in: number of bytes to copy +* @return dest */ +UNIV_INLINE +void* +ut_memcpy(void* dest, const void* sour, ulint n) +{ + return(memcpy(dest, sour, n)); +} + +/** Wrapper for memmove(3). Copy memory area when the source and +target are overlapping. +* @param dest in: copy to +* @param sour in: copy from +* @param n in: number of bytes to copy +* @return dest */ +UNIV_INLINE +void* +ut_memmove(void* dest, const void* sour, ulint n) +{ + return(memmove(dest, sour, n)); +} + +/** Wrapper for memcmp(3). Compare memory areas. +* @param str1 in: first memory block to compare +* @param str2 in: second memory block to compare +* @param n in: number of bytes to compare +* @return negative, 0, or positive if str1 is smaller, equal, + or greater than str2, respectively. */ +UNIV_INLINE +int +ut_memcmp(const void* str1, const void* str2, ulint n) +{ + return(memcmp(str1, str2, n)); +} + +/** Wrapper for strcpy(3). Copy a NUL-terminated string. +* @param dest in: copy to +* @param sour in: copy from +* @return dest */ +UNIV_INLINE +char* +ut_strcpy(char* dest, const char* sour) +{ + return(strcpy(dest, sour)); +} + +/** Wrapper for strlen(3). Determine the length of a NUL-terminated string. +* @param str in: string +* @return length of the string in bytes, excluding the terminating NUL */ +UNIV_INLINE +ulint +ut_strlen(const char* str) +{ + return(strlen(str)); +} + +/** Wrapper for strcmp(3). Compare NUL-terminated strings. +* @param str1 in: first string to compare +* @param str2 in: second string to compare +* @return negative, 0, or positive if str1 is smaller, equal, + or greater than str2, respectively. */ +UNIV_INLINE +int +ut_strcmp(const char* str1, const char* str2) +{ + return(strcmp(str1, str2)); +} + +/**********************************************************************//** +Converts a raw binary data to a NUL-terminated hex string. The output is +truncated if there is not enough space in "hex", make sure "hex_size" is at +least (2 * raw_size + 1) if you do not want this to happen. Returns the +actual number of characters written to "hex" (including the NUL). +@return number of chars written */ +UNIV_INLINE +ulint +ut_raw_to_hex( +/*==========*/ + const void* raw, /*!< in: raw data */ + ulint raw_size, /*!< in: "raw" length in bytes */ + char* hex, /*!< out: hex string */ + ulint hex_size) /*!< in: "hex" size in bytes */ +{ + +#ifdef WORDS_BIGENDIAN + +#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b)) + +#define UINT16_GET_A(u) ((unsigned char) ((u) >> 8)) +#define UINT16_GET_B(u) ((unsigned char) ((u) & 0xFF)) + +#else /* WORDS_BIGENDIAN */ + +#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a)) + +#define UINT16_GET_A(u) ((unsigned char) ((u) & 0xFF)) +#define UINT16_GET_B(u) ((unsigned char) ((u) >> 8)) + +#endif /* WORDS_BIGENDIAN */ + +#define MK_ALL_UINT16_WITH_A(a) \ + MK_UINT16(a, '0'), \ + MK_UINT16(a, '1'), \ + MK_UINT16(a, '2'), \ + MK_UINT16(a, '3'), \ + MK_UINT16(a, '4'), \ + MK_UINT16(a, '5'), \ + MK_UINT16(a, '6'), \ + MK_UINT16(a, '7'), \ + MK_UINT16(a, '8'), \ + MK_UINT16(a, '9'), \ + MK_UINT16(a, 'A'), \ + MK_UINT16(a, 'B'), \ + MK_UINT16(a, 'C'), \ + MK_UINT16(a, 'D'), \ + MK_UINT16(a, 'E'), \ + MK_UINT16(a, 'F') + + static const uint16 hex_map[256] = { + MK_ALL_UINT16_WITH_A('0'), + MK_ALL_UINT16_WITH_A('1'), + MK_ALL_UINT16_WITH_A('2'), + MK_ALL_UINT16_WITH_A('3'), + MK_ALL_UINT16_WITH_A('4'), + MK_ALL_UINT16_WITH_A('5'), + MK_ALL_UINT16_WITH_A('6'), + MK_ALL_UINT16_WITH_A('7'), + MK_ALL_UINT16_WITH_A('8'), + MK_ALL_UINT16_WITH_A('9'), + MK_ALL_UINT16_WITH_A('A'), + MK_ALL_UINT16_WITH_A('B'), + MK_ALL_UINT16_WITH_A('C'), + MK_ALL_UINT16_WITH_A('D'), + MK_ALL_UINT16_WITH_A('E'), + MK_ALL_UINT16_WITH_A('F') + }; + const unsigned char* rawc; + ulint read_bytes; + ulint write_bytes; + ulint i; + + rawc = (const unsigned char*) raw; + + if (hex_size == 0) { + + return(0); + } + + if (hex_size <= 2 * raw_size) { + + read_bytes = hex_size / 2; + write_bytes = hex_size; + } else { + + read_bytes = raw_size; + write_bytes = 2 * raw_size + 1; + } + +#define LOOP_READ_BYTES(ASSIGN) \ + for (i = 0; i < read_bytes; i++) { \ + ASSIGN; \ + hex += 2; \ + rawc++; \ + } + + if (ut_align_offset(hex, 2) == 0) { + + LOOP_READ_BYTES( + *(uint16*) hex = hex_map[*rawc] + ); + } else { + + LOOP_READ_BYTES( + *hex = UINT16_GET_A(hex_map[*rawc]); + *(hex + 1) = UINT16_GET_B(hex_map[*rawc]) + ); + } + + if (hex_size <= 2 * raw_size && hex_size % 2 == 0) { + + hex--; + } + + *hex = '\0'; + + return(write_bytes); +} + +/*******************************************************************//** +Adds single quotes to the start and end of string and escapes any quotes +by doubling them. Returns the number of bytes that were written to "buf" +(including the terminating NUL). If buf_size is too small then the +trailing bytes from "str" are discarded. +@return number of bytes that were written */ +UNIV_INLINE +ulint +ut_str_sql_format( +/*==============*/ + const char* str, /*!< in: string */ + ulint str_len, /*!< in: string length in bytes */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + ulint str_i; + ulint buf_i; + + buf_i = 0; + + switch (buf_size) { + case 3: + + if (str_len == 0) { + + buf[buf_i] = '\''; + buf_i++; + buf[buf_i] = '\''; + buf_i++; + } + /* FALLTHROUGH */ + case 2: + case 1: + + buf[buf_i] = '\0'; + buf_i++; + /* FALLTHROUGH */ + case 0: + + return(buf_i); + } + + /* buf_size >= 4 */ + + buf[0] = '\''; + buf_i = 1; + + for (str_i = 0; str_i < str_len; str_i++) { + + char ch; + + if (buf_size - buf_i == 2) { + + break; + } + + ch = str[str_i]; + + switch (ch) { + case '\0': + + if (buf_size - buf_i < 4) { + + goto func_exit; + } + buf[buf_i] = '\\'; + buf_i++; + buf[buf_i] = '0'; + buf_i++; + break; + case '\'': + case '\\': + + if (buf_size - buf_i < 4) { + + goto func_exit; + } + buf[buf_i] = ch; + buf_i++; + /* FALLTHROUGH */ + default: + + buf[buf_i] = ch; + buf_i++; + } + } + +func_exit: + + buf[buf_i] = '\''; + buf_i++; + buf[buf_i] = '\0'; + buf_i++; + + return(buf_i); +} diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h new file mode 100644 index 00000000000..e0593e99bde --- /dev/null +++ b/storage/innobase/include/ut0rbt.h @@ -0,0 +1,324 @@ +/***************************************************************************//** + +Copyright (c) 2007, 2010, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/******************************************************************//** +@file include/ut0rbt.h +Various utilities + +Created 2007-03-20 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_UT0RBT_H +#define INNOBASE_UT0RBT_H + +#if !defined(IB_RBT_TESTING) +#include "univ.i" +#include "ut0mem.h" +#else +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#define ut_malloc malloc +#define ut_free free +#define ulint unsigned long +#define ut_a(c) assert(c) +#define ut_error assert(0) +#define ibool unsigned int +#define TRUE 1 +#define FALSE 0 +#endif + +struct ib_rbt_node_t; +typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node); +typedef int (*ib_rbt_compare)(const void* p1, const void* p2); +typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2); + +/** Red black tree color types */ +enum ib_rbt_color_t { + IB_RBT_RED, + IB_RBT_BLACK +}; + +/** Red black tree node */ +struct ib_rbt_node_t { + ib_rbt_color_t color; /* color of this node */ + + ib_rbt_node_t* left; /* points left child */ + ib_rbt_node_t* right; /* points right child */ + ib_rbt_node_t* parent; /* points parent node */ + + char value[1]; /* Data value */ +}; + +/** Red black tree instance.*/ +struct ib_rbt_t { + ib_rbt_node_t* nil; /* Black colored node that is + used as a sentinel. This is + pre-allocated too.*/ + + ib_rbt_node_t* root; /* Root of the tree, this is + pre-allocated and the first + data node is the left child.*/ + + ulint n_nodes; /* Total number of data nodes */ + + ib_rbt_compare compare; /* Fn. to use for comparison */ + ib_rbt_arg_compare + compare_with_arg; /* Fn. to use for comparison + with argument */ + ulint sizeof_value; /* Sizeof the item in bytes */ + void* cmp_arg; /* Compare func argument */ +}; + +/** The result of searching for a key in the tree, this is useful for +a speedy lookup and insert if key doesn't exist.*/ +struct ib_rbt_bound_t { + const ib_rbt_node_t* + last; /* Last node visited */ + + int result; /* Result of comparing with + the last non-nil node that + was visited */ +}; + +/* Size in elements (t is an rb tree instance) */ +#define rbt_size(t) (t->n_nodes) + +/* Check whether the rb tree is empty (t is an rb tree instance) */ +#define rbt_empty(t) (rbt_size(t) == 0) + +/* Get data value (t is the data type, n is an rb tree node instance) */ +#define rbt_value(t, n) ((t*) &n->value[0]) + +/* Compare a key with the node value (t is tree, k is key, n is node)*/ +#define rbt_compare(t, k, n) (t->compare(k, n->value)) + +/**********************************************************************//** +Free an instance of a red black tree */ +UNIV_INTERN +void +rbt_free( +/*=====*/ + ib_rbt_t* tree); /*!< in: rb tree to free */ +/**********************************************************************//** +Create an instance of a red black tree +@return rb tree instance */ +UNIV_INTERN +ib_rbt_t* +rbt_create( +/*=======*/ + size_t sizeof_value, /*!< in: size in bytes */ + ib_rbt_compare compare); /*!< in: comparator */ +/**********************************************************************//** +Create an instance of a red black tree, whose comparison function takes +an argument +@return rb tree instance */ +UNIV_INTERN +ib_rbt_t* +rbt_create_arg_cmp( +/*===============*/ + size_t sizeof_value, /*!< in: size in bytes */ + ib_rbt_arg_compare + compare, /*!< in: comparator */ + void* cmp_arg); /*!< in: compare fn arg */ +/**********************************************************************//** +Delete a node from the red black tree, identified by key */ +UNIV_INTERN +ibool +rbt_delete( +/*=======*/ + /* in: TRUE on success */ + ib_rbt_t* tree, /* in: rb tree */ + const void* key); /* in: key to delete */ +/**********************************************************************//** +Remove a node from the red black tree, NOTE: This function will not delete +the node instance, THAT IS THE CALLERS RESPONSIBILITY. +@return the deleted node with the const. */ +UNIV_INTERN +ib_rbt_node_t* +rbt_remove_node( +/*============*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* + node); /*!< in: node to delete, this + is a fudge and declared const + because the caller has access + only to const nodes.*/ +/**********************************************************************//** +Return a node from the red black tree, identified by +key, NULL if not found +@return node if found else return NULL */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_lookup( +/*=======*/ + const ib_rbt_t* tree, /*!< in: rb tree to search */ + const void* key); /*!< in: key to lookup */ +/**********************************************************************//** +Add data to the red black tree, identified by key (no dups yet!) +@return inserted node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_insert( +/*=======*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const void* key, /*!< in: key for ordering */ + const void* value); /*!< in: data that will be + copied to the node.*/ +/**********************************************************************//** +Add a new node to the tree, useful for data that is pre-sorted. +@return appended node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_add_node( +/*=========*/ + ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: parent */ + const void* value); /*!< in: this value is copied + to the node */ +/**********************************************************************//** +Return the left most data node in the tree +@return left most node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_first( +/*======*/ + const ib_rbt_t* tree); /*!< in: rb tree */ +/**********************************************************************//** +Return the right most data node in the tree +@return right most node */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_last( +/*=====*/ + const ib_rbt_t* tree); /*!< in: rb tree */ +/**********************************************************************//** +Return the next node from current. +@return successor node to current that is passed in. */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_next( +/*=====*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* /* in: current node */ + current); +/**********************************************************************//** +Return the prev node from current. +@return precedessor node to current that is passed in */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_prev( +/*=====*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* /* in: current node */ + current); +/**********************************************************************//** +Find the node that has the lowest key that is >= key. +@return node that satisfies the lower bound constraint or NULL */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_lower_bound( +/*============*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const void* key); /*!< in: key to search */ +/**********************************************************************//** +Find the node that has the greatest key that is <= key. +@return node that satisifies the upper bound constraint or NULL */ +UNIV_INTERN +const ib_rbt_node_t* +rbt_upper_bound( +/*============*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const void* key); /*!< in: key to search */ +/**********************************************************************//** +Search for the key, a node will be retuned in parent.last, whether it +was found or not. If not found then parent.last will contain the +parent node for the possibly new key otherwise the matching node. +@return result of last comparison */ +UNIV_INTERN +int +rbt_search( +/*=======*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: search bounds */ + const void* key); /*!< in: key to search */ +/**********************************************************************//** +Search for the key, a node will be retuned in parent.last, whether it +was found or not. If not found then parent.last will contain the +parent node for the possibly new key otherwise the matching node. +@return result of last comparison */ +UNIV_INTERN +int +rbt_search_cmp( +/*===========*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: search bounds */ + const void* key, /*!< in: key to search */ + ib_rbt_compare compare, /*!< in: comparator */ + ib_rbt_arg_compare + arg_compare); /*!< in: fn to compare items + with argument */ +/**********************************************************************//** +Clear the tree, deletes (and free's) all the nodes. */ +UNIV_INTERN +void +rbt_clear( +/*======*/ + ib_rbt_t* tree); /*!< in: rb tree */ +/**********************************************************************//** +Merge the node from dst into src. Return the number of nodes merged. +@return no. of recs merged */ +UNIV_INTERN +ulint +rbt_merge_uniq( +/*===========*/ + ib_rbt_t* dst, /*!< in: dst rb tree */ + const ib_rbt_t* src); /*!< in: src rb tree */ +/**********************************************************************//** +Merge the node from dst into src. Return the number of nodes merged. +Delete the nodes from src after copying node to dst. As a side effect +the duplicates will be left untouched in the src, since we don't support +duplicates (yet). NOTE: src and dst must be similar, the function doesn't +check for this condition (yet). +@return no. of recs merged */ +UNIV_INTERN +ulint +rbt_merge_uniq_destructive( +/*=======================*/ + ib_rbt_t* dst, /*!< in: dst rb tree */ + ib_rbt_t* src); /*!< in: src rb tree */ +/**********************************************************************//** +Verify the integrity of the RB tree. For debugging. 0 failure else height +of tree (in count of black nodes). +@return TRUE if OK FALSE if tree invalid. */ +UNIV_INTERN +ibool +rbt_validate( +/*=========*/ + const ib_rbt_t* tree); /*!< in: tree to validate */ +/**********************************************************************//** +Iterate over the tree in depth first order. */ +UNIV_INTERN +void +rbt_print( +/*======*/ + const ib_rbt_t* tree, /*!< in: tree to traverse */ + ib_rbt_print_node print); /*!< in: print function */ + +#endif /* INNOBASE_UT0RBT_H */ diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h new file mode 100644 index 00000000000..53b769849a5 --- /dev/null +++ b/storage/innobase/include/ut0rnd.h @@ -0,0 +1,148 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0rnd.h +Random numbers and hashing + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0rnd_h +#define ut0rnd_h + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +#include "ut0byte.h" + +/** The 'character code' for end of field or string (used +in folding records */ +#define UT_END_OF_FIELD 257 + +/********************************************************//** +This is used to set the random number seed. */ +UNIV_INLINE +void +ut_rnd_set_seed( +/*============*/ + ulint seed); /*!< in: seed */ +/********************************************************//** +The following function generates a series of 'random' ulint integers. +@return the next 'random' number */ +UNIV_INLINE +ulint +ut_rnd_gen_next_ulint( +/*==================*/ + ulint rnd); /*!< in: the previous random number value */ +/*********************************************************//** +The following function generates 'random' ulint integers which +enumerate the value space (let there be N of them) of ulint integers +in a pseudo-random fashion. Note that the same integer is repeated +always after N calls to the generator. +@return the 'random' number */ +UNIV_INLINE +ulint +ut_rnd_gen_ulint(void); +/*==================*/ +/********************************************************//** +Generates a random integer from a given interval. +@return the 'random' number */ +UNIV_INLINE +ulint +ut_rnd_interval( +/*============*/ + ulint low, /*!< in: low limit; can generate also this value */ + ulint high); /*!< in: high limit; can generate also this value */ +/*********************************************************//** +Generates a random iboolean value. +@return the random value */ +UNIV_INLINE +ibool +ut_rnd_gen_ibool(void); +/*=================*/ +/*******************************************************//** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime or some +random number to work reliably. +@return hash value */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + ulint key, /*!< in: value to be hashed */ + ulint table_size); /*!< in: hash table size */ +/*************************************************************//** +Folds a 64-bit integer. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ull( +/*========*/ + ib_uint64_t d) /*!< in: 64-bit integer */ + __attribute__((const)); +/*************************************************************//** +Folds a character string ending in the null character. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_string( +/*===========*/ + const char* str) /*!< in: null-terminated string */ + __attribute__((pure)); +/***********************************************************//** +Looks for a prime number slightly greater than the given argument. +The prime is chosen so that it is not near any power of 2. +@return prime */ +UNIV_INTERN +ulint +ut_find_prime( +/*==========*/ + ulint n) /*!< in: positive number > 100 */ + __attribute__((const)); + +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Folds a pair of ulints. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + ulint n1, /*!< in: ulint */ + ulint n2) /*!< in: ulint */ + __attribute__((const)); +/*************************************************************//** +Folds a binary string. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + const byte* str, /*!< in: string of bytes */ + ulint len) /*!< in: length */ + __attribute__((pure)); + + +#ifndef UNIV_NONINL +#include "ut0rnd.ic" +#endif + +#endif diff --git a/storage/innobase/include/ut0rnd.ic b/storage/innobase/include/ut0rnd.ic new file mode 100644 index 00000000000..024c59e553b --- /dev/null +++ b/storage/innobase/include/ut0rnd.ic @@ -0,0 +1,255 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0rnd.ic +Random numbers and hashing + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +#define UT_HASH_RANDOM_MASK 1463735687 +#define UT_HASH_RANDOM_MASK2 1653893711 + +#ifndef UNIV_INNOCHECKSUM + +#define UT_RND1 151117737 +#define UT_RND2 119785373 +#define UT_RND3 85689495 +#define UT_RND4 76595339 +#define UT_SUM_RND2 98781234 +#define UT_SUM_RND3 126792457 +#define UT_SUM_RND4 63498502 +#define UT_XOR_RND1 187678878 +#define UT_XOR_RND2 143537923 + +/** Seed value of ut_rnd_gen_ulint() */ +extern ulint ut_rnd_ulint_counter; + +/********************************************************//** +This is used to set the random number seed. */ +UNIV_INLINE +void +ut_rnd_set_seed( +/*============*/ + ulint seed) /*!< in: seed */ +{ + ut_rnd_ulint_counter = seed; +} + +/********************************************************//** +The following function generates a series of 'random' ulint integers. +@return the next 'random' number */ +UNIV_INLINE +ulint +ut_rnd_gen_next_ulint( +/*==================*/ + ulint rnd) /*!< in: the previous random number value */ +{ + ulint n_bits; + + n_bits = 8 * sizeof(ulint); + + rnd = UT_RND2 * rnd + UT_SUM_RND3; + rnd = UT_XOR_RND1 ^ rnd; + rnd = (rnd << 20) + (rnd >> (n_bits - 20)); + rnd = UT_RND3 * rnd + UT_SUM_RND4; + rnd = UT_XOR_RND2 ^ rnd; + rnd = (rnd << 20) + (rnd >> (n_bits - 20)); + rnd = UT_RND1 * rnd + UT_SUM_RND2; + + return(rnd); +} + +/********************************************************//** +The following function generates 'random' ulint integers which +enumerate the value space of ulint integers in a pseudo random +fashion. Note that the same integer is repeated always after +2 to power 32 calls to the generator (if ulint is 32-bit). +@return the 'random' number */ +UNIV_INLINE +ulint +ut_rnd_gen_ulint(void) +/*==================*/ +{ + ulint rnd; + + ut_rnd_ulint_counter = UT_RND1 * ut_rnd_ulint_counter + UT_RND2; + + rnd = ut_rnd_gen_next_ulint(ut_rnd_ulint_counter); + + return(rnd); +} + +/********************************************************//** +Generates a random integer from a given interval. +@return the 'random' number */ +UNIV_INLINE +ulint +ut_rnd_interval( +/*============*/ + ulint low, /*!< in: low limit; can generate also this value */ + ulint high) /*!< in: high limit; can generate also this value */ +{ + ulint rnd; + + ut_ad(high >= low); + + if (low == high) { + + return(low); + } + + rnd = ut_rnd_gen_ulint(); + + return(low + (rnd % (high - low))); +} + +/*********************************************************//** +Generates a random iboolean value. +@return the random value */ +UNIV_INLINE +ibool +ut_rnd_gen_ibool(void) +/*=================*/ +{ + ulint x; + + x = ut_rnd_gen_ulint(); + + if (((x >> 20) + (x >> 15)) & 1) { + + return(TRUE); + } + + return(FALSE); +} + +/*******************************************************//** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime +or some random number for the hash table to work reliably. +@return hash value */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + ulint key, /*!< in: value to be hashed */ + ulint table_size) /*!< in: hash table size */ +{ + ut_ad(table_size); + key = key ^ UT_HASH_RANDOM_MASK2; + + return(key % table_size); +} + +/*************************************************************//** +Folds a 64-bit integer. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ull( +/*========*/ + ib_uint64_t d) /*!< in: 64-bit integer */ +{ + return(ut_fold_ulint_pair((ulint) d & ULINT32_MASK, + (ulint) (d >> 32))); +} + +/*************************************************************//** +Folds a character string ending in the null character. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_string( +/*===========*/ + const char* str) /*!< in: null-terminated string */ +{ + ulint fold = 0; + + ut_ad(str); + + while (*str != '\0') { + fold = ut_fold_ulint_pair(fold, (ulint)(*str)); + str++; + } + + return(fold); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Folds a pair of ulints. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + ulint n1, /*!< in: ulint */ + ulint n2) /*!< in: ulint */ +{ + return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1) + ^ UT_HASH_RANDOM_MASK) + n2); +} + +/*************************************************************//** +Folds a binary string. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + const byte* str, /*!< in: string of bytes */ + ulint len) /*!< in: length */ +{ + ulint fold = 0; + const byte* str_end = str + (len & 0xFFFFFFF8); + + ut_ad(str || !len); + + while (str < str_end) { + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + } + + switch (len & 0x7) { + case 7: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + case 6: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + case 5: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + case 4: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + case 3: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + case 2: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + case 1: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + } + + return(fold); +} diff --git a/storage/innobase/include/ut0sort.h b/storage/innobase/include/ut0sort.h new file mode 100644 index 00000000000..75648b5c317 --- /dev/null +++ b/storage/innobase/include/ut0sort.h @@ -0,0 +1,106 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0sort.h +Sort utility + +Created 11/9/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0sort_h +#define ut0sort_h + +#include "univ.i" + +/* This module gives a macro definition of the body of +a standard sort function for an array of elements of any +type. The comparison function is given as a parameter to +the macro. The sort algorithm is mergesort which has logarithmic +worst case. +*/ + +/*******************************************************************//** +This macro expands to the body of a standard sort function. +The sort function uses mergesort and must be defined separately +for each type of array. +Also the comparison function has to be defined individually +for each array cell type. SORT_FUN is the sort function name. +The function takes the array to be sorted (ARR), +the array of auxiliary space (AUX_ARR) of same size, +and the low (LOW), inclusive, and high (HIGH), noninclusive, +limits for the sort interval as arguments. +CMP_FUN is the comparison function name. It takes as arguments +two elements from the array and returns 1, if the first is bigger, +0 if equal, and -1 if the second bigger. */ + +#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\ +{\ + ulint ut_sort_mid77;\ + ulint ut_sort_i77;\ + ulint ut_sort_low77;\ + ulint ut_sort_high77;\ +\ + ut_ad((LOW) < (HIGH));\ + ut_ad(ARR);\ + ut_ad(AUX_ARR);\ +\ + if ((LOW) == (HIGH) - 1) {\ + return;\ + } else if ((LOW) == (HIGH) - 2) {\ + if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\ + (AUX_ARR)[LOW] = (ARR)[LOW];\ + (ARR)[LOW] = (ARR)[(HIGH) - 1];\ + (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\ + }\ + return;\ + }\ +\ + ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\ +\ + SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\ + SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\ +\ + ut_sort_low77 = (LOW);\ + ut_sort_high77 = ut_sort_mid77;\ +\ + for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\ +\ + if (ut_sort_low77 >= ut_sort_mid77) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else if (ut_sort_high77 >= (HIGH)) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + } else if (CMP_FUN((ARR)[ut_sort_low77],\ + (ARR)[ut_sort_high77]) > 0) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + }\ + }\ +\ + memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\ + ((HIGH) - (LOW)) * sizeof *(ARR));\ +}\ + + +#endif + diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h new file mode 100644 index 00000000000..0caf379d8fa --- /dev/null +++ b/storage/innobase/include/ut0ut.h @@ -0,0 +1,497 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0ut.h +Various utilities + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0ut_h +#define ut0ut_h + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +#include "db0err.h" + +#ifndef UNIV_HOTBACKUP +# include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */ +#endif /* UNIV_HOTBACKUP */ + +#include <time.h> +#ifndef MYSQL_SERVER +#include <ctype.h> +#endif + +#include <stdarg.h> /* for va_list */ + +/** Index name prefix in fast index creation */ +#define TEMP_INDEX_PREFIX '\377' +/** Index name prefix in fast index creation, as a string constant */ +#define TEMP_INDEX_PREFIX_STR "\377" + +/** Time stamp */ +typedef time_t ib_time_t; + +/* In order to call a piece of code, when a function returns or when the +scope ends, use this utility class. It will invoke the given function +object in its destructor. */ +template<typename F> +struct ut_when_dtor { + ut_when_dtor(F& p) : f(p) {} + ~ut_when_dtor() { + f(); + } +private: + F& f; +}; + +#ifndef UNIV_HOTBACKUP +# if defined(HAVE_PAUSE_INSTRUCTION) + /* According to the gcc info page, asm volatile means that the + instruction has important side-effects and must not be removed. + Also asm volatile may trigger a memory barrier (spilling all registers + to memory). */ +# ifdef __SUNPRO_CC +# define UT_RELAX_CPU() asm ("pause" ) +# else +# define UT_RELAX_CPU() __asm__ __volatile__ ("pause") +# endif /* __SUNPRO_CC */ + +# elif defined(HAVE_FAKE_PAUSE_INSTRUCTION) +# define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop") +# elif defined(HAVE_ATOMIC_BUILTINS) +# define UT_RELAX_CPU() do { \ + volatile lint volatile_var; \ + os_compare_and_swap_lint(&volatile_var, 0, 1); \ + } while (0) +# elif defined(HAVE_WINDOWS_ATOMICS) + /* In the Win32 API, the x86 PAUSE instruction is executed by calling + the YieldProcessor macro defined in WinNT.h. It is a CPU architecture- + independent way by using YieldProcessor. */ +# define UT_RELAX_CPU() YieldProcessor() +# else +# define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */ +# endif + +/*********************************************************************//** +Delays execution for at most max_wait_us microseconds or returns earlier +if cond becomes true. +@param cond in: condition to wait for; evaluated every 2 ms +@param max_wait_us in: maximum delay to wait, in microseconds */ +#define UT_WAIT_FOR(cond, max_wait_us) \ +do { \ + ullint start_us; \ + start_us = ut_time_us(NULL); \ + while (!(cond) \ + && ut_time_us(NULL) - start_us < (max_wait_us)) {\ + \ + os_thread_sleep(2000 /* 2 ms */); \ + } \ +} while (0) +#endif /* !UNIV_HOTBACKUP */ + +template <class T> T ut_min(T a, T b) { return(a < b ? a : b); } +template <class T> T ut_max(T a, T b) { return(a > b ? a : b); } + +/******************************************************//** +Calculates the minimum of two ulints. +@return minimum */ +UNIV_INLINE +ulint +ut_min( +/*===*/ + ulint n1, /*!< in: first number */ + ulint n2); /*!< in: second number */ +/******************************************************//** +Calculates the maximum of two ulints. +@return maximum */ +UNIV_INLINE +ulint +ut_max( +/*===*/ + ulint n1, /*!< in: first number */ + ulint n2); /*!< in: second number */ +/****************************************************************//** +Calculates minimum of two ulint-pairs. */ +UNIV_INLINE +void +ut_pair_min( +/*========*/ + ulint* a, /*!< out: more significant part of minimum */ + ulint* b, /*!< out: less significant part of minimum */ + ulint a1, /*!< in: more significant part of first pair */ + ulint b1, /*!< in: less significant part of first pair */ + ulint a2, /*!< in: more significant part of second pair */ + ulint b2); /*!< in: less significant part of second pair */ +/******************************************************//** +Compares two ulints. +@return 1 if a > b, 0 if a == b, -1 if a < b */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + ulint a, /*!< in: ulint */ + ulint b); /*!< in: ulint */ +/*******************************************************//** +Compares two pairs of ulints. +@return -1 if a < b, 0 if a == b, 1 if a > b */ +UNIV_INLINE +int +ut_pair_cmp( +/*========*/ + ulint a1, /*!< in: more significant part of first pair */ + ulint a2, /*!< in: less significant part of first pair */ + ulint b1, /*!< in: more significant part of second pair */ + ulint b2); /*!< in: less significant part of second pair */ +/*************************************************************//** +Determines if a number is zero or a power of two. +@param n in: number +@return nonzero if n is zero or a power of two; zero otherwise */ +#define ut_is_2pow(n) UNIV_LIKELY(!((n) & ((n) - 1))) +/*************************************************************//** +Calculates fast the remainder of n/m when m is a power of two. +@param n in: numerator +@param m in: denominator, must be a power of two +@return the remainder of n/m */ +#define ut_2pow_remainder(n, m) ((n) & ((m) - 1)) +/*************************************************************//** +Calculates the biggest multiple of m that is not bigger than n +when m is a power of two. In other words, rounds n down to m * k. +@param n in: number to round down +@param m in: alignment, must be a power of two +@return n rounded down to the biggest possible integer multiple of m */ +#define ut_2pow_round(n, m) ((n) & ~((m) - 1)) +/** Align a number down to a multiple of a power of two. +@param n in: number to round down +@param m in: alignment, must be a power of two +@return n rounded down to the biggest possible integer multiple of m */ +#define ut_calc_align_down(n, m) ut_2pow_round(n, m) +/********************************************************//** +Calculates the smallest multiple of m that is not smaller than n +when m is a power of two. In other words, rounds n up to m * k. +@param n in: number to round up +@param m in: alignment, must be a power of two +@return n rounded up to the smallest possible integer multiple of m */ +#define ut_calc_align(n, m) (((n) + ((m) - 1)) & ~((m) - 1)) +/*************************************************************//** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. +@return logarithm in the base 2, rounded upward */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + ulint n); /*!< in: number */ +/*************************************************************//** +Calculates 2 to power n. +@return 2 to power n */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + ulint n); /*!< in: number */ +/*************************************************************//** +Calculates fast the number rounded up to the nearest power of 2. +@return first power of 2 which is >= n */ +UNIV_INTERN +ulint +ut_2_power_up( +/*==========*/ + ulint n) /*!< in: number != 0 */ + __attribute__((const)); + +/** Determine how many bytes (groups of 8 bits) are needed to +store the given number of bits. +@param b in: bits +@return number of bytes (octets) needed to represent b */ +#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8) + +/**********************************************************//** +Returns system time. We do not specify the format of the time returned: +the only way to manipulate it is to use the function ut_difftime. +@return system time */ +UNIV_INTERN +ib_time_t +ut_time(void); +/*=========*/ +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Returns system time. +Upon successful completion, the value 0 is returned; otherwise the +value -1 is returned and the global variable errno is set to indicate the +error. +@return 0 on success, -1 otherwise */ +UNIV_INTERN +int +ut_usectime( +/*========*/ + ulint* sec, /*!< out: seconds since the Epoch */ + ulint* ms); /*!< out: microseconds since the Epoch+*sec */ + +/**********************************************************//** +Returns the number of microseconds since epoch. Similar to +time(3), the return value is also stored in *tloc, provided +that tloc is non-NULL. +@return us since epoch */ +UNIV_INTERN +ullint +ut_time_us( +/*=======*/ + ullint* tloc); /*!< out: us since epoch, if non-NULL */ +/**********************************************************//** +Returns the number of milliseconds since some epoch. The +value may wrap around. It should only be used for heuristic +purposes. +@return ms since epoch */ +UNIV_INTERN +ulint +ut_time_ms(void); +/*============*/ +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************//** +Returns the number of milliseconds since some epoch. The +value may wrap around. It should only be used for heuristic +purposes. +@return ms since epoch */ +UNIV_INTERN +ulint +ut_time_ms(void); +/*============*/ + +/**********************************************************//** +Returns the difference of two times in seconds. +@return time2 - time1 expressed in seconds */ +UNIV_INTERN +double +ut_difftime( +/*========*/ + ib_time_t time2, /*!< in: time */ + ib_time_t time1); /*!< in: time */ + +#endif /* !UNIV_INNOCHECKSUM */ + +/**********************************************************//** +Prints a timestamp to a file. */ +UNIV_INTERN +void +ut_print_timestamp( +/*===============*/ + FILE* file) /*!< in: file where to print */ + UNIV_COLD __attribute__((nonnull)); + +#ifndef UNIV_INNOCHECKSUM + +/**********************************************************//** +Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */ +UNIV_INTERN +void +ut_sprintf_timestamp( +/*=================*/ + char* buf); /*!< in: buffer where to sprintf */ +#ifdef UNIV_HOTBACKUP +/**********************************************************//** +Sprintfs a timestamp to a buffer with no spaces and with ':' characters +replaced by '_'. */ +UNIV_INTERN +void +ut_sprintf_timestamp_without_extra_chars( +/*=====================================*/ + char* buf); /*!< in: buffer where to sprintf */ +/**********************************************************//** +Returns current year, month, day. */ +UNIV_INTERN +void +ut_get_year_month_day( +/*==================*/ + ulint* year, /*!< out: current year */ + ulint* month, /*!< out: month */ + ulint* day); /*!< out: day */ +#else /* UNIV_HOTBACKUP */ +/*************************************************************//** +Runs an idle loop on CPU. The argument gives the desired delay +in microseconds on 100 MHz Pentium + Visual C++. +@return dummy value */ +UNIV_INTERN +ulint +ut_delay( +/*=====*/ + ulint delay); /*!< in: delay in microseconds on 100 MHz Pentium */ +#endif /* UNIV_HOTBACKUP */ +/*************************************************************//** +Prints the contents of a memory buffer in hex and ascii. */ +UNIV_INTERN +void +ut_print_buf( +/*=========*/ + FILE* file, /*!< in: file where to print */ + const void* buf, /*!< in: memory buffer */ + ulint len); /*!< in: length of the buffer */ + +/**********************************************************************//** +Outputs a NUL-terminated file name, quoted with apostrophes. */ +UNIV_INTERN +void +ut_print_filename( +/*==============*/ + FILE* f, /*!< in: output stream */ + const char* name); /*!< in: name to print */ + +#ifndef UNIV_HOTBACKUP +/* Forward declaration of transaction handle */ +struct trx_t; + +/**********************************************************************//** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +UNIV_INTERN +void +ut_print_name( +/*==========*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ibool table_id,/*!< in: TRUE=print a table name, + FALSE=print other identifier */ + const char* name); /*!< in: name to print */ + +/**********************************************************************//** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +UNIV_INTERN +void +ut_print_namel( +/*===========*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction (NULL=no quotes) */ + ibool table_id,/*!< in: TRUE=print a table name, + FALSE=print other identifier */ + const char* name, /*!< in: name to print */ + ulint namelen);/*!< in: length of name */ + +/**********************************************************************//** +Formats a table or index name, quoted as an SQL identifier. If the name +contains a slash '/', the result will contain two identifiers separated by +a period (.), as in SQL database_name.identifier. +@return pointer to 'formatted' */ +UNIV_INTERN +char* +ut_format_name( +/*===========*/ + const char* name, /*!< in: table or index name, must be + '\0'-terminated */ + ibool is_table, /*!< in: if TRUE then 'name' is a table + name */ + char* formatted, /*!< out: formatted result, will be + '\0'-terminated */ + ulint formatted_size);/*!< out: no more than this number of + bytes will be written to 'formatted' */ + +/**********************************************************************//** +Catenate files. */ +UNIV_INTERN +void +ut_copy_file( +/*=========*/ + FILE* dest, /*!< in: output file */ + FILE* src); /*!< in: input file to be appended to output */ +#endif /* !UNIV_HOTBACKUP */ + +#ifdef __WIN__ +/**********************************************************************//** +A substitute for vsnprintf(3), formatted output conversion into +a limited buffer. Note: this function DOES NOT return the number of +characters that would have been printed if the buffer was unlimited because +VC's _vsnprintf() returns -1 in this case and we would need to call +_vscprintf() in addition to estimate that but we would need another copy +of "ap" for that and VC does not provide va_copy(). */ +UNIV_INTERN +void +ut_vsnprintf( +/*=========*/ + char* str, /*!< out: string */ + size_t size, /*!< in: str size */ + const char* fmt, /*!< in: format */ + va_list ap); /*!< in: format values */ + +/**********************************************************************//** +A substitute for snprintf(3), formatted output conversion into +a limited buffer. +@return number of characters that would have been printed if the size +were unlimited, not including the terminating '\0'. */ +UNIV_INTERN +int +ut_snprintf( +/*========*/ + char* str, /*!< out: string */ + size_t size, /*!< in: str size */ + const char* fmt, /*!< in: format */ + ...); /*!< in: format values */ +#else +/**********************************************************************//** +A wrapper for vsnprintf(3), formatted output conversion into +a limited buffer. Note: this function DOES NOT return the number of +characters that would have been printed if the buffer was unlimited because +VC's _vsnprintf() returns -1 in this case and we would need to call +_vscprintf() in addition to estimate that but we would need another copy +of "ap" for that and VC does not provide va_copy(). */ +# define ut_vsnprintf(buf, size, fmt, ap) \ + ((void) vsnprintf(buf, size, fmt, ap)) +/**********************************************************************//** +A wrapper for snprintf(3), formatted output conversion into +a limited buffer. */ +# define ut_snprintf snprintf +#endif /* __WIN__ */ + +/*************************************************************//** +Convert an error number to a human readable text message. The +returned string is static and should not be freed or modified. +@return string, describing the error */ +UNIV_INTERN +const char* +ut_strerr( +/*======*/ + dberr_t num); /*!< in: error number */ + +/**************************************************************** +Sort function for ulint arrays. */ +UNIV_INTERN +void +ut_ulint_sort( +/*==========*/ + ulint* arr, /*!< in/out: array to sort */ + ulint* aux_arr, /*!< in/out: aux array to use in sort */ + ulint low, /*!< in: lower bound */ + ulint high) /*!< in: upper bound */ + __attribute__((nonnull)); + +#ifndef UNIV_NONINL +#include "ut0ut.ic" +#endif + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif + diff --git a/storage/innobase/include/ut0ut.ic b/storage/innobase/include/ut0ut.ic new file mode 100644 index 00000000000..4e0f76e1957 --- /dev/null +++ b/storage/innobase/include/ut0ut.ic @@ -0,0 +1,162 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0ut.ic +Various utilities + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +/******************************************************//** +Calculates the minimum of two ulints. +@return minimum */ +UNIV_INLINE +ulint +ut_min( +/*===*/ + ulint n1, /*!< in: first number */ + ulint n2) /*!< in: second number */ +{ + return((n1 <= n2) ? n1 : n2); +} + +/******************************************************//** +Calculates the maximum of two ulints. +@return maximum */ +UNIV_INLINE +ulint +ut_max( +/*===*/ + ulint n1, /*!< in: first number */ + ulint n2) /*!< in: second number */ +{ + return((n1 <= n2) ? n2 : n1); +} + +/****************************************************************//** +Calculates minimum of two ulint-pairs. */ +UNIV_INLINE +void +ut_pair_min( +/*========*/ + ulint* a, /*!< out: more significant part of minimum */ + ulint* b, /*!< out: less significant part of minimum */ + ulint a1, /*!< in: more significant part of first pair */ + ulint b1, /*!< in: less significant part of first pair */ + ulint a2, /*!< in: more significant part of second pair */ + ulint b2) /*!< in: less significant part of second pair */ +{ + if (a1 == a2) { + *a = a1; + *b = ut_min(b1, b2); + } else if (a1 < a2) { + *a = a1; + *b = b1; + } else { + *a = a2; + *b = b2; + } +} + +/******************************************************//** +Compares two ulints. +@return 1 if a > b, 0 if a == b, -1 if a < b */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + ulint a, /*!< in: ulint */ + ulint b) /*!< in: ulint */ +{ + if (a < b) { + return(-1); + } else if (a == b) { + return(0); + } else { + return(1); + } +} + +/*******************************************************//** +Compares two pairs of ulints. +@return -1 if a < b, 0 if a == b, 1 if a > b */ +UNIV_INLINE +int +ut_pair_cmp( +/*========*/ + ulint a1, /*!< in: more significant part of first pair */ + ulint a2, /*!< in: less significant part of first pair */ + ulint b1, /*!< in: more significant part of second pair */ + ulint b2) /*!< in: less significant part of second pair */ +{ + if (a1 > b1) { + return(1); + } else if (a1 < b1) { + return(-1); + } else if (a2 > b2) { + return(1); + } else if (a2 < b2) { + return(-1); + } else { + return(0); + } +} + +/*************************************************************//** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. +@return logarithm in the base 2, rounded upward */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + ulint n) /*!< in: number != 0 */ +{ + ulint res; + + res = 0; + + ut_ad(n > 0); + + n = n - 1; + + for (;;) { + n = n / 2; + + if (n == 0) { + break; + } + + res++; + } + + return(res + 1); +} + +/*************************************************************//** +Calculates 2 to power n. +@return 2 to power n */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + ulint n) /*!< in: number */ +{ + return((ulint) 1 << n); +} diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h new file mode 100644 index 00000000000..432fb348a09 --- /dev/null +++ b/storage/innobase/include/ut0vec.h @@ -0,0 +1,337 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0vec.h +A vector of pointers to data items + +Created 4/6/2006 Osku Salerma +************************************************************************/ + +#ifndef IB_VECTOR_H +#define IB_VECTOR_H + +#include "univ.i" +#include "mem0mem.h" + +struct ib_alloc_t; +struct ib_vector_t; + +typedef void* (*ib_mem_alloc_t)( + /* out: Pointer to allocated memory */ + ib_alloc_t* allocator, /* in: Pointer to allocator instance */ + ulint size); /* in: Number of bytes to allocate */ + +typedef void (*ib_mem_free_t)( + ib_alloc_t* allocator, /* in: Pointer to allocator instance */ + void* ptr); /* in: Memory to free */ + +typedef void* (*ib_mem_resize_t)( + /* out: Pointer to resized memory */ + ib_alloc_t* allocator, /* in: Pointer to allocator */ + void* ptr, /* in: Memory to resize */ + ulint old_size, /* in: Old memory size in bytes */ + ulint new_size); /* in: New size in bytes */ + +typedef int (*ib_compare_t)(const void*, const void*); + +/* An automatically resizing vector datatype with the following properties: + + -All memory allocation is done through an allocator, which is responsible for +freeing it when done with the vector. +*/ + +/* This is useful shorthand for elements of type void* */ +#define ib_vector_getp(v, n) (*(void**) ib_vector_get(v, n)) +#define ib_vector_getp_const(v, n) (*(void**) ib_vector_get_const(v, n)) + +#define ib_vector_allocator(v) (v->allocator) + +/******************************************************************** +Create a new vector with the given initial size. */ +UNIV_INTERN +ib_vector_t* +ib_vector_create( +/*=============*/ + /* out: vector */ + ib_alloc_t* alloc, /* in: Allocator */ + /* in: size of the data item */ + ulint sizeof_value, + ulint size); /* in: initial size */ + +/******************************************************************** +Destroy the vector. Make sure the vector owns the allocator, e.g., +the heap in the the heap allocator. */ +UNIV_INLINE +void +ib_vector_free( +/*===========*/ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Push a new element to the vector, increasing its size if necessary, +if elem is not NULL then elem is copied to the vector.*/ +UNIV_INLINE +void* +ib_vector_push( +/*===========*/ + /* out: pointer the "new" element */ + ib_vector_t* vec, /* in/out: vector */ + const void* elem); /* in: data element */ + +/******************************************************************** +Pop the last element from the vector.*/ +UNIV_INLINE +void* +ib_vector_pop( +/*==========*/ + /* out: pointer to the "new" element */ + ib_vector_t* vec); /* in/out: vector */ + +/*******************************************************************//** +Remove an element to the vector +@return pointer to the "removed" element */ +UNIV_INLINE +void* +ib_vector_remove( +/*=============*/ + ib_vector_t* vec, /*!< in: vector */ + const void* elem); /*!< in: value to remove */ + +/******************************************************************** +Get the number of elements in the vector. */ +UNIV_INLINE +ulint +ib_vector_size( +/*===========*/ + /* out: number of elements in vector */ + const ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Increase the size of the vector. */ +UNIV_INTERN +void +ib_vector_resize( +/*=============*/ + /* out: number of elements in vector */ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Test whether a vector is empty or not. +@return TRUE if empty */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ + const ib_vector_t* vec); /*!< in: vector */ + +/****************************************************************//** +Get the n'th element. +@return n'th element */ +UNIV_INLINE +void* +ib_vector_get( +/*==========*/ + ib_vector_t* vec, /*!< in: vector */ + ulint n); /*!< in: element index to get */ + +/******************************************************************** +Const version of the get n'th element. +@return n'th element */ +UNIV_INLINE +const void* +ib_vector_get_const( +/*================*/ + const ib_vector_t* vec, /* in: vector */ + ulint n); /* in: element index to get */ +/****************************************************************//** +Get last element. The vector must not be empty. +@return last element */ +UNIV_INLINE +void* +ib_vector_get_last( +/*===============*/ + ib_vector_t* vec); /*!< in: vector */ +/****************************************************************//** +Set the n'th element. */ +UNIV_INLINE +void +ib_vector_set( +/*==========*/ + ib_vector_t* vec, /*!< in/out: vector */ + ulint n, /*!< in: element index to set */ + void* elem); /*!< in: data element */ + +/******************************************************************** +Reset the vector size to 0 elements. */ +UNIV_INLINE +void +ib_vector_reset( +/*============*/ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +void* +ib_vector_last( +/*===========*/ + /* out: pointer to last element */ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +const void* +ib_vector_last_const( +/*=================*/ + /* out: pointer to last element */ + const ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Sort the vector elements. */ +UNIV_INLINE +void +ib_vector_sort( +/*===========*/ + ib_vector_t* vec, /* in/out: vector */ + ib_compare_t compare); /* in: the comparator to use for sort */ + +/******************************************************************** +The default ib_vector_t heap free. Does nothing. */ +UNIV_INLINE +void +ib_heap_free( +/*=========*/ + ib_alloc_t* allocator, /* in: allocator */ + void* ptr); /* in: size in bytes */ + +/******************************************************************** +The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_malloc( +/*===========*/ + /* out: pointer to allocated memory */ + ib_alloc_t* allocator, /* in: allocator */ + ulint size); /* in: size in bytes */ + +/******************************************************************** +The default ib_vector_t heap resize. Since we can't resize the heap +we have to copy the elements from the old ptr to the new ptr. +Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_resize( +/*===========*/ + /* out: pointer to reallocated + memory */ + ib_alloc_t* allocator, /* in: allocator */ + void* old_ptr, /* in: pointer to memory */ + ulint old_size, /* in: old size in bytes */ + ulint new_size); /* in: new size in bytes */ + +/******************************************************************** +Create a heap allocator that uses the passed in heap. */ +UNIV_INLINE +ib_alloc_t* +ib_heap_allocator_create( +/*=====================*/ + /* out: heap allocator instance */ + mem_heap_t* heap); /* in: heap to use */ + +/******************************************************************** +Free a heap allocator. */ +UNIV_INLINE +void +ib_heap_allocator_free( +/*===================*/ + ib_alloc_t* ib_ut_alloc); /* in: alloc instace to free */ + +/******************************************************************** +Wrapper for ut_free(). */ +UNIV_INLINE +void +ib_ut_free( +/*=======*/ + ib_alloc_t* allocator, /* in: allocator */ + void* ptr); /* in: size in bytes */ + +/******************************************************************** +Wrapper for ut_malloc(). */ +UNIV_INLINE +void* +ib_ut_malloc( +/*=========*/ + /* out: pointer to allocated memory */ + ib_alloc_t* allocator, /* in: allocator */ + ulint size); /* in: size in bytes */ + +/******************************************************************** +Wrapper for ut_realloc(). */ +UNIV_INLINE +void* +ib_ut_resize( +/*=========*/ + /* out: pointer to reallocated + memory */ + ib_alloc_t* allocator, /* in: allocator */ + void* old_ptr, /* in: pointer to memory */ + ulint old_size, /* in: old size in bytes */ + ulint new_size); /* in: new size in bytes */ + +/******************************************************************** +Create a heap allocator that uses the passed in heap. */ +UNIV_INLINE +ib_alloc_t* +ib_ut_allocator_create(void); +/*=========================*/ + +/******************************************************************** +Create a heap allocator that uses the passed in heap. */ +UNIV_INLINE +void +ib_ut_allocator_free( +/*=================*/ + ib_alloc_t* ib_ut_alloc); /* in: alloc instace to free */ + +/* Allocator used by ib_vector_t. */ +struct ib_alloc_t { + ib_mem_alloc_t mem_malloc; /* For allocating memory */ + ib_mem_free_t mem_release; /* For freeing memory */ + ib_mem_resize_t mem_resize; /* For resizing memory */ + void* arg; /* Currently if not NULL then it + points to the heap instance */ +}; + +/* See comment at beginning of file. */ +struct ib_vector_t { + ib_alloc_t* allocator; /* Allocator, because one size + doesn't fit all */ + void* data; /* data elements */ + ulint used; /* number of elements currently used */ + ulint total; /* number of elements allocated */ + /* Size of a data item */ + ulint sizeof_value; +}; + +#ifndef UNIV_NONINL +#include "ut0vec.ic" +#endif + +#endif /* IB_VECTOR_H */ diff --git a/storage/innobase/include/ut0vec.ic b/storage/innobase/include/ut0vec.ic new file mode 100644 index 00000000000..f41a85e1d1d --- /dev/null +++ b/storage/innobase/include/ut0vec.ic @@ -0,0 +1,425 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0vec.ic +A vector of pointers to data items + +Created 4/6/2006 Osku Salerma +************************************************************************/ + +#define IB_VEC_OFFSET(v, i) (vec->sizeof_value * i) + +/******************************************************************** +The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_malloc( +/*===========*/ + ib_alloc_t* allocator, /* in: allocator */ + ulint size) /* in: size in bytes */ +{ + mem_heap_t* heap = (mem_heap_t*) allocator->arg; + + return(mem_heap_alloc(heap, size)); +} + +/******************************************************************** +The default ib_vector_t heap free. Does nothing. */ +UNIV_INLINE +void +ib_heap_free( +/*=========*/ + ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */ + void* ptr UNIV_UNUSED) /* in: size in bytes */ +{ + /* We can't free individual elements. */ +} + +/******************************************************************** +The default ib_vector_t heap resize. Since we can't resize the heap +we have to copy the elements from the old ptr to the new ptr. +Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_resize( +/*===========*/ + ib_alloc_t* allocator, /* in: allocator */ + void* old_ptr, /* in: pointer to memory */ + ulint old_size, /* in: old size in bytes */ + ulint new_size) /* in: new size in bytes */ +{ + void* new_ptr; + mem_heap_t* heap = (mem_heap_t*) allocator->arg; + + new_ptr = mem_heap_alloc(heap, new_size); + memcpy(new_ptr, old_ptr, old_size); + + return(new_ptr); +} + +/******************************************************************** +Create a heap allocator that uses the passed in heap. */ +UNIV_INLINE +ib_alloc_t* +ib_heap_allocator_create( +/*=====================*/ + mem_heap_t* heap) /* in: heap to use */ +{ + ib_alloc_t* heap_alloc; + + heap_alloc = (ib_alloc_t*) mem_heap_alloc(heap, sizeof(*heap_alloc)); + + heap_alloc->arg = heap; + heap_alloc->mem_release = ib_heap_free; + heap_alloc->mem_malloc = ib_heap_malloc; + heap_alloc->mem_resize = ib_heap_resize; + + return(heap_alloc); +} + +/******************************************************************** +Free a heap allocator. */ +UNIV_INLINE +void +ib_heap_allocator_free( +/*===================*/ + ib_alloc_t* ib_ut_alloc) /* in: alloc instace to free */ +{ + mem_heap_free((mem_heap_t*) ib_ut_alloc->arg); +} + +/******************************************************************** +Wrapper around ut_malloc(). */ +UNIV_INLINE +void* +ib_ut_malloc( +/*=========*/ + ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */ + ulint size) /* in: size in bytes */ +{ + return(ut_malloc(size)); +} + +/******************************************************************** +Wrapper around ut_free(). */ +UNIV_INLINE +void +ib_ut_free( +/*=======*/ + ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */ + void* ptr) /* in: size in bytes */ +{ + ut_free(ptr); +} + +/******************************************************************** +Wrapper aroung ut_realloc(). */ +UNIV_INLINE +void* +ib_ut_resize( +/*=========*/ + ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */ + void* old_ptr, /* in: pointer to memory */ + ulint old_size UNIV_UNUSED,/* in: old size in bytes */ + ulint new_size) /* in: new size in bytes */ +{ + return(ut_realloc(old_ptr, new_size)); +} + +/******************************************************************** +Create a ut allocator. */ +UNIV_INLINE +ib_alloc_t* +ib_ut_allocator_create(void) +/*========================*/ +{ + ib_alloc_t* ib_ut_alloc; + + ib_ut_alloc = (ib_alloc_t*) ut_malloc(sizeof(*ib_ut_alloc)); + + ib_ut_alloc->arg = NULL; + ib_ut_alloc->mem_release = ib_ut_free; + ib_ut_alloc->mem_malloc = ib_ut_malloc; + ib_ut_alloc->mem_resize = ib_ut_resize; + + return(ib_ut_alloc); +} + +/******************************************************************** +Free a ut allocator. */ +UNIV_INLINE +void +ib_ut_allocator_free( +/*=================*/ + ib_alloc_t* ib_ut_alloc) /* in: alloc instace to free */ +{ + ut_free(ib_ut_alloc); +} + +/******************************************************************** +Get number of elements in vector. */ +UNIV_INLINE +ulint +ib_vector_size( +/*===========*/ + /* out: number of elements in vector*/ + const ib_vector_t* vec) /* in: vector */ +{ + return(vec->used); +} + +/****************************************************************//** +Get n'th element. */ +UNIV_INLINE +void* +ib_vector_get( +/*==========*/ + ib_vector_t* vec, /*!< in: vector */ + ulint n) /*!< in: element index to get */ +{ + ut_a(n < vec->used); + + return((byte*) vec->data + IB_VEC_OFFSET(vec, n)); +} + +/******************************************************************** +Const version of the get n'th element. +@return n'th element */ +UNIV_INLINE +const void* +ib_vector_get_const( +/*================*/ + const ib_vector_t* vec, /* in: vector */ + ulint n) /* in: element index to get */ +{ + ut_a(n < vec->used); + + return((byte*) vec->data + IB_VEC_OFFSET(vec, n)); +} +/****************************************************************//** +Get last element. The vector must not be empty. +@return last element */ +UNIV_INLINE +void* +ib_vector_get_last( +/*===============*/ + ib_vector_t* vec) /*!< in: vector */ +{ + ut_a(vec->used > 0); + + return((byte*) ib_vector_get(vec, vec->used - 1)); +} + +/****************************************************************//** +Set the n'th element. */ +UNIV_INLINE +void +ib_vector_set( +/*==========*/ + ib_vector_t* vec, /*!< in/out: vector */ + ulint n, /*!< in: element index to set */ + void* elem) /*!< in: data element */ +{ + void* slot; + + ut_a(n < vec->used); + + slot = ((byte*) vec->data + IB_VEC_OFFSET(vec, n)); + memcpy(slot, elem, vec->sizeof_value); +} + +/******************************************************************** +Reset the vector size to 0 elements. */ +UNIV_INLINE +void +ib_vector_reset( +/*============*/ + /* out: void */ + ib_vector_t* vec) /* in: vector */ +{ + vec->used = 0; +} + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +void* +ib_vector_last( +/*===========*/ + /* out: void */ + ib_vector_t* vec) /* in: vector */ +{ + ut_a(ib_vector_size(vec) > 0); + + return(ib_vector_get(vec, ib_vector_size(vec) - 1)); +} + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +const void* +ib_vector_last_const( +/*=================*/ + /* out: void */ + const ib_vector_t* vec) /* in: vector */ +{ + ut_a(ib_vector_size(vec) > 0); + + return(ib_vector_get_const(vec, ib_vector_size(vec) - 1)); +} + +/****************************************************************//** +Remove the last element from the vector. +@return last vector element */ +UNIV_INLINE +void* +ib_vector_pop( +/*==========*/ + /* out: pointer to element */ + ib_vector_t* vec) /* in: vector */ +{ + void* elem; + + ut_a(vec->used > 0); + + elem = ib_vector_last(vec); + --vec->used; + + return(elem); +} + +/******************************************************************** +Append an element to the vector, if elem != NULL then copy the data +from elem.*/ +UNIV_INLINE +void* +ib_vector_push( +/*===========*/ + /* out: pointer to the "new" element */ + ib_vector_t* vec, /* in: vector */ + const void* elem) /* in: element to add (can be NULL) */ +{ + void* last; + + if (vec->used >= vec->total) { + ib_vector_resize(vec); + } + + last = (byte*) vec->data + IB_VEC_OFFSET(vec, vec->used); + +#ifdef UNIV_DEBUG + memset(last, 0, vec->sizeof_value); +#endif + + if (elem) { + memcpy(last, elem, vec->sizeof_value); + } + + ++vec->used; + + return(last); +} + +/*******************************************************************//** +Remove an element to the vector +@return pointer to the "removed" element */ +UNIV_INLINE +void* +ib_vector_remove( +/*=============*/ + ib_vector_t* vec, /*!< in: vector */ + const void* elem) /*!< in: value to remove */ +{ + void* current = NULL; + void* next; + ulint i; + ulint old_used_count = vec->used; + + for (i = 0; i < vec->used; i++) { + current = ib_vector_get(vec, i); + + if (*(void**) current == elem) { + if (i == vec->used - 1) { + return(ib_vector_pop(vec)); + } + + next = ib_vector_get(vec, i + 1); + memmove(current, next, vec->sizeof_value + * (vec->used - i - 1)); + --vec->used; + break; + } + } + + return((old_used_count != vec->used) ? current : NULL); +} + +/******************************************************************** +Sort the vector elements. */ +UNIV_INLINE +void +ib_vector_sort( +/*===========*/ + /* out: void */ + ib_vector_t* vec, /* in: vector */ + ib_compare_t compare)/* in: the comparator to use for sort */ +{ + qsort(vec->data, vec->used, vec->sizeof_value, compare); +} + +/******************************************************************** +Destroy the vector. Make sure the vector owns the allocator, e.g., +the heap in the the heap allocator. */ +UNIV_INLINE +void +ib_vector_free( +/*===========*/ + ib_vector_t* vec) /* in, own: vector */ +{ + /* Currently we only support two types of allocators, heap + and ut_malloc(), when the heap is freed all the elements are + freed too. With ut allocator, we need to free the elements, + the vector instance and the allocator separately. */ + + /* Only the heap allocator uses the arg field. */ + if (vec->allocator->arg) { + mem_heap_free((mem_heap_t*) vec->allocator->arg); + } else { + ib_alloc_t* allocator; + + allocator = vec->allocator; + + allocator->mem_release(allocator, vec->data); + allocator->mem_release(allocator, vec); + + ib_ut_allocator_free(allocator); + } +} + +/******************************************************************** +Test whether a vector is empty or not. +@return TRUE if empty */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ + const ib_vector_t* vec) /*!< in: vector */ +{ + return(ib_vector_size(vec) == 0); +} diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h new file mode 100644 index 00000000000..33385ddf2d4 --- /dev/null +++ b/storage/innobase/include/ut0wqueue.h @@ -0,0 +1,105 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0wqueue.h +A work queue + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/*******************************************************************//** +A Work queue. Threads can add work items to the queue and other threads can +wait for work items to be available and take them off the queue for +processing. +************************************************************************/ + +#ifndef IB_WORK_QUEUE_H +#define IB_WORK_QUEUE_H + +#include "ut0list.h" +#include "mem0mem.h" +#include "os0sync.h" +#include "sync0types.h" + +struct ib_wqueue_t; + +/****************************************************************//** +Create a new work queue. +@return work queue */ +UNIV_INTERN +ib_wqueue_t* +ib_wqueue_create(void); +/*===================*/ + +/****************************************************************//** +Free a work queue. */ +UNIV_INTERN +void +ib_wqueue_free( +/*===========*/ + ib_wqueue_t* wq); /*!< in: work queue */ + +/****************************************************************//** +Add a work item to the queue. */ +UNIV_INTERN +void +ib_wqueue_add( +/*==========*/ + ib_wqueue_t* wq, /*!< in: work queue */ + void* item, /*!< in: work item */ + mem_heap_t* heap); /*!< in: memory heap to use for allocating the + list node */ + +/******************************************************************** +Check if queue is empty. */ + +ibool +ib_wqueue_is_empty( +/*===============*/ + /* out: TRUE if queue empty + else FALSE */ + const ib_wqueue_t* wq); /* in: work queue */ + +/****************************************************************//** +Wait for a work item to appear in the queue. +@return work item */ +UNIV_INTERN +void* +ib_wqueue_wait( +/*===========*/ + ib_wqueue_t* wq); /*!< in: work queue */ + +/******************************************************************** +Wait for a work item to appear in the queue for specified time. */ + +void* +ib_wqueue_timedwait( +/*================*/ + /* out: work item or NULL on timeout*/ + ib_wqueue_t* wq, /* in: work queue */ + ib_time_t wait_in_usecs); /* in: wait time in micro seconds */ + +/* Work queue. */ +struct ib_wqueue_t { + ib_mutex_t mutex; /*!< mutex protecting everything */ + ib_list_t* items; /*!< work item list */ + os_event_t event; /*!< event we use to signal additions to list */ +}; + +#endif |