summaryrefslogtreecommitdiff
path: root/storage/innobase/include
diff options
context:
space:
mode:
authorGuilhem Bichot <guilhem@mysql.com>2009-08-07 12:16:00 +0200
committerGuilhem Bichot <guilhem@mysql.com>2009-08-07 12:16:00 +0200
commit7ceb29ff17047a268d8e8670478f6e1669939904 (patch)
tree65b42f5cb11f29ea5b4414ff075ccafd48569ad6 /storage/innobase/include
parent7fa1449eac29401b3d1f3fb4980149e9e562a705 (diff)
downloadmariadb-git-7ceb29ff17047a268d8e8670478f6e1669939904.tar.gz
Renamed storage/innodb_plugin to storage/innobase, so that 1) it's the same
layout as we always had in trees containing only the builtin 2) win\configure.js WITH_INNOBASE_STORAGE_ENGINE still works. storage/innobase/CMakeLists.txt: fix to new directory name (and like 5.1) storage/innobase/Makefile.am: fix to new directory name (and like 5.1) storage/innobase/handler/ha_innodb.cc: fix to new directory name (and like 5.1) storage/innobase/plug.in: fix to new directory name (and like 5.1)
Diffstat (limited to 'storage/innobase/include')
-rw-r--r--storage/innobase/include/btr0btr.h509
-rw-r--r--storage/innobase/include/btr0btr.ic310
-rw-r--r--storage/innobase/include/btr0cur.h753
-rw-r--r--storage/innobase/include/btr0cur.ic200
-rw-r--r--storage/innobase/include/btr0pcur.h537
-rw-r--r--storage/innobase/include/btr0pcur.ic651
-rw-r--r--storage/innobase/include/btr0sea.h304
-rw-r--r--storage/innobase/include/btr0sea.ic84
-rw-r--r--storage/innobase/include/btr0types.h51
-rw-r--r--storage/innobase/include/buf0buddy.h90
-rw-r--r--storage/innobase/include/buf0buddy.ic127
-rw-r--r--storage/innobase/include/buf0buf.h1531
-rw-r--r--storage/innobase/include/buf0buf.ic1066
-rw-r--r--storage/innobase/include/buf0flu.h191
-rw-r--r--storage/innobase/include/buf0flu.ic123
-rw-r--r--storage/innobase/include/buf0lru.h263
-rw-r--r--storage/innobase/include/buf0lru.ic25
-rw-r--r--storage/innobase/include/buf0rea.h139
-rw-r--r--storage/innobase/include/buf0types.h80
-rw-r--r--storage/innobase/include/data0data.h483
-rw-r--r--storage/innobase/include/data0data.ic612
-rw-r--r--storage/innobase/include/data0type.h486
-rw-r--r--storage/innobase/include/data0type.ic599
-rw-r--r--storage/innobase/include/data0types.h36
-rw-r--r--storage/innobase/include/db0err.h105
-rw-r--r--storage/innobase/include/dict0boot.h150
-rw-r--r--storage/innobase/include/dict0boot.ic93
-rw-r--r--storage/innobase/include/dict0crea.h197
-rw-r--r--storage/innobase/include/dict0crea.ic25
-rw-r--r--storage/innobase/include/dict0dict.h1158
-rw-r--r--storage/innobase/include/dict0dict.ic806
-rw-r--r--storage/innobase/include/dict0load.h115
-rw-r--r--storage/innobase/include/dict0load.ic26
-rw-r--r--storage/innobase/include/dict0mem.h537
-rw-r--r--storage/innobase/include/dict0mem.ic26
-rw-r--r--storage/innobase/include/dict0types.h48
-rw-r--r--storage/innobase/include/dyn0dyn.h188
-rw-r--r--storage/innobase/include/dyn0dyn.ic365
-rw-r--r--storage/innobase/include/eval0eval.h114
-rw-r--r--storage/innobase/include/eval0eval.ic251
-rw-r--r--storage/innobase/include/eval0proc.h104
-rw-r--r--storage/innobase/include/eval0proc.ic88
-rw-r--r--storage/innobase/include/fil0fil.h726
-rw-r--r--storage/innobase/include/fsp0fsp.h359
-rw-r--r--storage/innobase/include/fsp0fsp.ic45
-rw-r--r--storage/innobase/include/fsp0types.h110
-rw-r--r--storage/innobase/include/fut0fut.h55
-rw-r--r--storage/innobase/include/fut0fut.ic56
-rw-r--r--storage/innobase/include/fut0lst.h217
-rw-r--r--storage/innobase/include/fut0lst.ic167
-rw-r--r--storage/innobase/include/ha0ha.h241
-rw-r--r--storage/innobase/include/ha0ha.ic220
-rw-r--r--storage/innobase/include/ha0storage.h140
-rw-r--r--storage/innobase/include/ha0storage.ic148
-rw-r--r--storage/innobase/include/ha_prototypes.h283
-rw-r--r--storage/innobase/include/handler0alter.h42
-rw-r--r--storage/innobase/include/hash0hash.h446
-rw-r--r--storage/innobase/include/hash0hash.ic163
-rw-r--r--storage/innobase/include/ibuf0ibuf.h377
-rw-r--r--storage/innobase/include/ibuf0ibuf.ic327
-rw-r--r--storage/innobase/include/ibuf0types.h31
-rw-r--r--storage/innobase/include/lock0iter.h69
-rw-r--r--storage/innobase/include/lock0lock.h809
-rw-r--r--storage/innobase/include/lock0lock.ic121
-rw-r--r--storage/innobase/include/lock0priv.h108
-rw-r--r--storage/innobase/include/lock0priv.ic49
-rw-r--r--storage/innobase/include/lock0types.h45
-rw-r--r--storage/innobase/include/log0log.h958
-rw-r--r--storage/innobase/include/log0log.ic417
-rw-r--r--storage/innobase/include/log0recv.h466
-rw-r--r--storage/innobase/include/log0recv.ic53
-rw-r--r--storage/innobase/include/mach0data.h400
-rw-r--r--storage/innobase/include/mach0data.ic786
-rw-r--r--storage/innobase/include/mem0dbg.h143
-rw-r--r--storage/innobase/include/mem0dbg.ic112
-rw-r--r--storage/innobase/include/mem0mem.h392
-rw-r--r--storage/innobase/include/mem0mem.ic646
-rw-r--r--storage/innobase/include/mem0pool.h129
-rw-r--r--storage/innobase/include/mem0pool.ic24
-rw-r--r--storage/innobase/include/mtr0log.h250
-rw-r--r--storage/innobase/include/mtr0log.ic274
-rw-r--r--storage/innobase/include/mtr0mtr.h416
-rw-r--r--storage/innobase/include/mtr0mtr.ic272
-rw-r--r--storage/innobase/include/mtr0types.h31
-rw-r--r--storage/innobase/include/mysql_addons.h33
-rw-r--r--storage/innobase/include/os0file.h796
-rw-r--r--storage/innobase/include/os0proc.h77
-rw-r--r--storage/innobase/include/os0proc.ic27
-rw-r--r--storage/innobase/include/os0sync.h386
-rw-r--r--storage/innobase/include/os0sync.ic53
-rw-r--r--storage/innobase/include/os0thread.h162
-rw-r--r--storage/innobase/include/os0thread.ic25
-rw-r--r--storage/innobase/include/page0cur.h346
-rw-r--r--storage/innobase/include/page0cur.ic299
-rw-r--r--storage/innobase/include/page0page.h1012
-rw-r--r--storage/innobase/include/page0page.ic1073
-rw-r--r--storage/innobase/include/page0types.h150
-rw-r--r--storage/innobase/include/page0zip.h471
-rw-r--r--storage/innobase/include/page0zip.ic397
-rw-r--r--storage/innobase/include/pars0grm.h236
-rw-r--r--storage/innobase/include/pars0opt.h75
-rw-r--r--storage/innobase/include/pars0opt.ic24
-rw-r--r--storage/innobase/include/pars0pars.h742
-rw-r--r--storage/innobase/include/pars0pars.ic24
-rw-r--r--storage/innobase/include/pars0sym.h244
-rw-r--r--storage/innobase/include/pars0sym.ic24
-rw-r--r--storage/innobase/include/pars0types.h50
-rw-r--r--storage/innobase/include/que0que.h513
-rw-r--r--storage/innobase/include/que0que.ic273
-rw-r--r--storage/innobase/include/que0types.h60
-rw-r--r--storage/innobase/include/read0read.h194
-rw-r--r--storage/innobase/include/read0read.ic98
-rw-r--r--storage/innobase/include/read0types.h32
-rw-r--r--storage/innobase/include/rem0cmp.h194
-rw-r--r--storage/innobase/include/rem0cmp.ic91
-rw-r--r--storage/innobase/include/rem0rec.h824
-rw-r--r--storage/innobase/include/rem0rec.ic1647
-rw-r--r--storage/innobase/include/rem0types.h46
-rw-r--r--storage/innobase/include/row0ext.h95
-rw-r--r--storage/innobase/include/row0ext.ic84
-rw-r--r--storage/innobase/include/row0ins.h156
-rw-r--r--storage/innobase/include/row0ins.ic26
-rw-r--r--storage/innobase/include/row0merge.h197
-rw-r--r--storage/innobase/include/row0mysql.h784
-rw-r--r--storage/innobase/include/row0mysql.ic24
-rw-r--r--storage/innobase/include/row0purge.h96
-rw-r--r--storage/innobase/include/row0purge.ic25
-rw-r--r--storage/innobase/include/row0row.h310
-rw-r--r--storage/innobase/include/row0row.ic120
-rw-r--r--storage/innobase/include/row0sel.h413
-rw-r--r--storage/innobase/include/row0sel.ic105
-rw-r--r--storage/innobase/include/row0types.h59
-rw-r--r--storage/innobase/include/row0uins.h54
-rw-r--r--storage/innobase/include/row0uins.ic25
-rw-r--r--storage/innobase/include/row0umod.h52
-rw-r--r--storage/innobase/include/row0umod.ic24
-rw-r--r--storage/innobase/include/row0undo.h142
-rw-r--r--storage/innobase/include/row0undo.ic24
-rw-r--r--storage/innobase/include/row0upd.h483
-rw-r--r--storage/innobase/include/row0upd.ic184
-rw-r--r--storage/innobase/include/row0vers.h142
-rw-r--r--storage/innobase/include/row0vers.ic30
-rw-r--r--storage/innobase/include/srv0que.h42
-rw-r--r--storage/innobase/include/srv0srv.h664
-rw-r--r--storage/innobase/include/srv0srv.ic24
-rw-r--r--storage/innobase/include/srv0start.h134
-rw-r--r--storage/innobase/include/sync0arr.h142
-rw-r--r--storage/innobase/include/sync0arr.ic27
-rw-r--r--storage/innobase/include/sync0rw.h585
-rw-r--r--storage/innobase/include/sync0rw.ic624
-rw-r--r--storage/innobase/include/sync0sync.h578
-rw-r--r--storage/innobase/include/sync0sync.ic222
-rw-r--r--storage/innobase/include/sync0types.h34
-rw-r--r--storage/innobase/include/thr0loc.h84
-rw-r--r--storage/innobase/include/thr0loc.ic24
-rw-r--r--storage/innobase/include/trx0i_s.h240
-rw-r--r--storage/innobase/include/trx0purge.h183
-rw-r--r--storage/innobase/include/trx0purge.ic43
-rw-r--r--storage/innobase/include/trx0rec.h338
-rw-r--r--storage/innobase/include/trx0rec.ic112
-rw-r--r--storage/innobase/include/trx0roll.h341
-rw-r--r--storage/innobase/include/trx0roll.ic40
-rw-r--r--storage/innobase/include/trx0rseg.h213
-rw-r--r--storage/innobase/include/trx0rseg.ic145
-rw-r--r--storage/innobase/include/trx0sys.h618
-rw-r--r--storage/innobase/include/trx0sys.ic387
-rw-r--r--storage/innobase/include/trx0trx.h814
-rw-r--r--storage/innobase/include/trx0trx.ic164
-rw-r--r--storage/innobase/include/trx0types.h108
-rw-r--r--storage/innobase/include/trx0undo.h544
-rw-r--r--storage/innobase/include/trx0undo.ic351
-rw-r--r--storage/innobase/include/trx0xa.h70
-rw-r--r--storage/innobase/include/univ.i498
-rw-r--r--storage/innobase/include/usr0sess.h78
-rw-r--r--storage/innobase/include/usr0sess.ic24
-rw-r--r--storage/innobase/include/usr0types.h31
-rw-r--r--storage/innobase/include/ut0auxconf.h14
-rw-r--r--storage/innobase/include/ut0byte.h270
-rw-r--r--storage/innobase/include/ut0byte.ic411
-rw-r--r--storage/innobase/include/ut0dbg.h175
-rw-r--r--storage/innobase/include/ut0list.h172
-rw-r--r--storage/innobase/include/ut0list.ic48
-rw-r--r--storage/innobase/include/ut0lst.h261
-rw-r--r--storage/innobase/include/ut0mem.h306
-rw-r--r--storage/innobase/include/ut0mem.ic338
-rw-r--r--storage/innobase/include/ut0rnd.h143
-rw-r--r--storage/innobase/include/ut0rnd.ic230
-rw-r--r--storage/innobase/include/ut0sort.h106
-rw-r--r--storage/innobase/include/ut0ut.h385
-rw-r--r--storage/innobase/include/ut0ut.ic162
-rw-r--r--storage/innobase/include/ut0vec.h125
-rw-r--r--storage/innobase/include/ut0vec.ic96
-rw-r--r--storage/innobase/include/ut0wqueue.h85
193 files changed, 50974 insertions, 0 deletions
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
new file mode 100644
index 00000000000..d5c8258513c
--- /dev/null
+++ b/storage/innobase/include/btr0btr.h
@@ -0,0 +1,509 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.h
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0btr_h
+#define btr0btr_h
+
+#include "univ.i"
+
+#include "dict0dict.h"
+#include "data0data.h"
+#include "page0cur.h"
+#include "mtr0mtr.h"
+#include "btr0types.h"
+
+#ifndef UNIV_HOTBACKUP
+/** Maximum record size which can be stored on a page, without using the
+special big record storage structure */
+#define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200)
+
+/** @brief Maximum depth of a B-tree in InnoDB.
+
+Note that this isn't a maximum as such; none of the tree operations
+avoid producing trees bigger than this. It is instead a "max depth
+that other code must work with", useful for e.g. fixed-size arrays
+that must store some information about each level in a tree. In other
+words: if a B-tree with bigger depth than this is encountered, it is
+not acceptable for it to lead to mysterious memory corruption, but it
+is acceptable for the program to die with a clear assert failure. */
+#define BTR_MAX_LEVELS 100
+
+/** Latching modes for btr_cur_search_to_nth_level(). */
+enum btr_latch_mode {
+ /** Search a record on a leaf page and S-latch it. */
+ BTR_SEARCH_LEAF = RW_S_LATCH,
+ /** (Prepare to) modify a record on a leaf page and X-latch it. */
+ BTR_MODIFY_LEAF = RW_X_LATCH,
+ /** Obtain no latches. */
+ BTR_NO_LATCHES = RW_NO_LATCH,
+ /** Start modifying the entire B-tree. */
+ BTR_MODIFY_TREE = 33,
+ /** Continue modifying the entire B-tree. */
+ BTR_CONT_MODIFY_TREE = 34,
+ /** Search the previous record. */
+ BTR_SEARCH_PREV = 35,
+ /** Modify the previous record. */
+ BTR_MODIFY_PREV = 36
+};
+
+/** If this is ORed to btr_latch_mode, it means that the search tuple
+will be inserted to the index, at the searched position */
+#define BTR_INSERT 512
+
+/** This flag ORed to btr_latch_mode says that we do the search in query
+optimization */
+#define BTR_ESTIMATE 1024
+
+/** This flag ORed to btr_latch_mode says that we can ignore possible
+UNIQUE definition on secondary indexes when we decide if we can use
+the insert buffer to speed up inserts */
+#define BTR_IGNORE_SEC_UNIQUE 2048
+
+/**************************************************************//**
+Gets the root node of a tree and x-latches it.
+@return root page, x-latched */
+UNIV_INTERN
+page_t*
+btr_root_get(
+/*=========*/
+ dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+buf_block_t*
+btr_block_get(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ ulint mode, /*!< in: latch mode */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ ulint mode, /*!< in: latch mode */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+dulint
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page); /*!< in: index page */
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Gets the node level field in an index page.
+@return level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level_low(
+/*===================*/
+ const page_t* page); /*!< in: index page */
+/********************************************************//**
+Gets the node level field in an index page.
+@return level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level(
+/*===============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Gets the next index page number.
+@return next page number */
+UNIV_INLINE
+ulint
+btr_page_get_next(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Gets the previous index page number.
+@return prev page number */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/*************************************************************//**
+Gets pointer to the previous user record in the tree. It is assumed
+that the caller has appropriate latches on the page and its neighbor.
+@return previous user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_prev_user_rec(
+/*==================*/
+ rec_t* rec, /*!< in: record on leaf level */
+ mtr_t* mtr); /*!< in: mtr holding a latch on the page, and if
+ needed, also to the previous page */
+/*************************************************************//**
+Gets pointer to the next user record in the tree. It is assumed
+that the caller has appropriate latches on the page and its neighbor.
+@return next user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_next_user_rec(
+/*==================*/
+ rec_t* rec, /*!< in: record on leaf level */
+ mtr_t* mtr); /*!< in: mtr holding a latch on the page, and if
+ needed, also to the next page */
+/**************************************************************//**
+Releases the latch on a leaf page and bufferunfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+@return child node address */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/************************************************************//**
+Creates the root node for a new index tree.
+@return page number of the created root, FIL_NULL if did not succeed */
+UNIV_INTERN
+ulint
+btr_create(
+/*=======*/
+ ulint type, /*!< in: type of the index */
+ ulint space, /*!< in: space where created */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ dulint index_id,/*!< in: index id */
+ dict_index_t* index, /*!< in: index */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/************************************************************//**
+Frees a B-tree except the root page, which MUST be freed after this
+by calling btr_free_root. */
+UNIV_INTERN
+void
+btr_free_but_not_root(
+/*==================*/
+ ulint space, /*!< in: space where created */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint root_page_no); /*!< in: root page number */
+/************************************************************//**
+Frees the B-tree root page. Other tree MUST already have been freed. */
+UNIV_INTERN
+void
+btr_free_root(
+/*==========*/
+ ulint space, /*!< in: space where created */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint root_page_no, /*!< in: root page number */
+ mtr_t* mtr); /*!< in: a mini-transaction which has already
+ been started */
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Reorganizes an index page.
+IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
+page of a non-clustered index, the caller must update the insert
+buffer free bits in the same mini-transaction in such a way that the
+modification will be redo-logged.
+@return TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+btr_page_reorganize(
+/*================*/
+ buf_block_t* block, /*!< in: page to be reorganized */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Decides if the page should be split at the convergence point of
+inserts converging to left.
+@return TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_left(
+/*===========================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert */
+ rec_t** split_rec);/*!< out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple should be first */
+/*************************************************************//**
+Decides if the page should be split at the convergence point of
+inserts converging to right.
+@return TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_right(
+/*============================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert */
+ rec_t** split_rec);/*!< out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple should be first */
+/*************************************************************//**
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr); /*!< in: mtr */
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+UNIV_INTERN
+void
+btr_insert_on_non_leaf_level(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/****************************************************************//**
+Sets a record as the predefined minimum record. */
+UNIV_INTERN
+void
+btr_set_min_rec_mark(
+/*=================*/
+ rec_t* rec, /*!< in/out: record */
+ mtr_t* mtr); /*!< in: mtr */
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page. */
+UNIV_INTERN
+void
+btr_node_ptr_delete(
+/*================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page whose node pointer is deleted */
+ mtr_t* mtr); /*!< in: mtr */
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */
+UNIV_INTERN
+ibool
+btr_check_node_ptr(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the
+brother reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to
+the brothers, if they exist.
+@return TRUE on success */
+UNIV_INTERN
+ibool
+btr_compress(
+/*=========*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift;
+ the page must not be empty: in record delete
+ use btr_discard_page if the page would become
+ empty */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+UNIV_INTERN
+void
+btr_discard_page(
+/*=============*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/****************************************************************//**
+Parses the redo log record for setting an index record as the predefined
+minimum record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_set_min_rec_mark(
+/*=======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint comp, /*!< in: nonzero=compact page format */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses a redo log record of reorganizing a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_page_reorganize(
+/*======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ dict_index_t* index, /*!< in: record descriptor */
+ buf_block_t* block, /*!< in: page to be reorganized, or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Gets the number of pages in a B-tree.
+@return number of pages */
+UNIV_INTERN
+ulint
+btr_get_size(
+/*=========*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag); /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@return new allocated block, x-latched; NULL if out of space */
+UNIV_INTERN
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+ dict_index_t* index, /*!< in: index tree */
+ ulint hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Frees a file page used in an index tree. NOTE: cannot free field external
+storage pages because the page must contain info on its level. */
+UNIV_INTERN
+void
+btr_page_free(
+/*==========*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Frees a file page used in an index tree. Can be used also to BLOB
+external storage pages, because the page level 0 can be given as an
+argument. */
+UNIV_INTERN
+void
+btr_page_free_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ ulint level, /*!< in: page level */
+ mtr_t* mtr); /*!< in: mtr */
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+UNIV_INTERN
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index); /*!< in: index tree */
+/**************************************************************//**
+Prints directories and other info of all nodes in the index. */
+UNIV_INTERN
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width); /*!< in: print this many entries from start
+ and end */
+#endif /* UNIV_BTR_PRINT */
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error); /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx); /*!< in: transaction or NULL */
+
+#define BTR_N_LEAF_PAGES 1
+#define BTR_TOTAL_SIZE 2
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "btr0btr.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic
new file mode 100644
index 00000000000..2259d22c9a6
--- /dev/null
+++ b/storage/innobase/include/btr0btr.ic
@@ -0,0 +1,310 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.ic
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0zip.h"
+
+#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level
+ (not really a hard limit).
+ Used in debug assertions
+ in btr_page_set_level and
+ btr_page_get_level_low */
+
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+buf_block_t*
+btr_block_get(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ ulint mode, /*!< in: latch mode */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+
+ block = buf_page_get(space, zip_size, page_no, mode, mtr);
+
+ if (mode != RW_NO_LATCH) {
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+ }
+
+ return(block);
+}
+
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ ulint mode, /*!< in: latch mode */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ return(buf_block_get_frame(btr_block_get(space, zip_size, page_no,
+ mode, mtr)));
+}
+
+/**************************************************************//**
+Sets the index id field of a page. */
+UNIV_INLINE
+void
+btr_page_set_index_id(
+/*==================*/
+ page_t* page, /*!< in: page to be created */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ dulint id, /*!< in: index id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), id);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_INDEX_ID),
+ 8, mtr);
+ } else {
+ mlog_write_dulint(page + (PAGE_HEADER + PAGE_INDEX_ID),
+ id, mtr);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+dulint
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID));
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Gets the node level field in an index page.
+@return level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level_low(
+/*===================*/
+ const page_t* page) /*!< in: index page */
+{
+ ulint level;
+
+ ut_ad(page);
+
+ level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL);
+
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+ return(level);
+}
+
+/********************************************************//**
+Gets the node level field in an index page.
+@return level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level(
+/*===============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ return(btr_page_get_level_low(page));
+}
+
+/********************************************************//**
+Sets the node level field in an index page. */
+UNIV_INLINE
+void
+btr_page_set_level(
+/*===============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint level, /*!< in: level, leaf level == 0 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_2(page + (PAGE_HEADER + PAGE_LEVEL), level);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_LEVEL),
+ 2, mtr);
+ } else {
+ mlog_write_ulint(page + (PAGE_HEADER + PAGE_LEVEL), level,
+ MLOG_2BYTES, mtr);
+ }
+}
+
+/********************************************************//**
+Gets the next index page number.
+@return next page number */
+UNIV_INLINE
+ulint
+btr_page_get_next(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX));
+
+ return(mach_read_from_4(page + FIL_PAGE_NEXT));
+}
+
+/********************************************************//**
+Sets the next index page field. */
+UNIV_INLINE
+void
+btr_page_set_next(
+/*==============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint next, /*!< in: next page number */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_4(page + FIL_PAGE_NEXT, next);
+ page_zip_write_header(page_zip, page + FIL_PAGE_NEXT, 4, mtr);
+ } else {
+ mlog_write_ulint(page + FIL_PAGE_NEXT, next, MLOG_4BYTES, mtr);
+ }
+}
+
+/********************************************************//**
+Gets the previous index page number.
+@return prev page number */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr __attribute__((unused))) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ return(mach_read_from_4(page + FIL_PAGE_PREV));
+}
+
+/********************************************************//**
+Sets the previous index page field. */
+UNIV_INLINE
+void
+btr_page_set_prev(
+/*==============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint prev, /*!< in: previous page number */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_4(page + FIL_PAGE_PREV, prev);
+ page_zip_write_header(page_zip, page + FIL_PAGE_PREV, 4, mtr);
+ } else {
+ mlog_write_ulint(page + FIL_PAGE_PREV, prev, MLOG_4BYTES, mtr);
+ }
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+@return child node address */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ const byte* field;
+ ulint len;
+ ulint page_no;
+
+ ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+ /* The child address is in the last field */
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1, &len);
+
+ ut_ad(len == 4);
+
+ page_no = mach_read_from_4(field);
+
+ if (UNIV_UNLIKELY(page_no == 0)) {
+ fprintf(stderr,
+ "InnoDB: a nonsensical page number 0"
+ " in a node ptr record at offset %lu\n",
+ (ulong) page_offset(rec));
+ buf_page_print(page_align(rec), 0);
+ }
+
+ return(page_no);
+}
+
+/**************************************************************//**
+Releases the latches on a leaf page and bufferunfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+ ut_ad(!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY));
+
+ mtr_memo_release(mtr, block,
+ latch_mode == BTR_SEARCH_LEAF
+ ? MTR_MEMO_PAGE_S_FIX
+ : MTR_MEMO_PAGE_X_FIX);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
new file mode 100644
index 00000000000..b2d43ae3254
--- /dev/null
+++ b/storage/innobase/include/btr0cur.h
@@ -0,0 +1,753 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.h
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0cur_h
+#define btr0cur_h
+
+#include "univ.i"
+#include "dict0dict.h"
+#include "page0cur.h"
+#include "btr0types.h"
+
+/* Mode flags for btr_cur operations; these can be ORed */
+#define BTR_NO_UNDO_LOG_FLAG 1 /* do no undo logging */
+#define BTR_NO_LOCKING_FLAG 2 /* do no record lock checking */
+#define BTR_KEEP_SYS_FLAG 4 /* sys fields will be found from the
+ update vector or inserted entry */
+
+#ifndef UNIV_HOTBACKUP
+#include "que0types.h"
+#include "row0types.h"
+#include "ha0ha.h"
+
+#define BTR_CUR_ADAPT
+#define BTR_CUR_HASH_ADAPT
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the page cursor component of a tree cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+ const btr_cur_t* cursor);/*!< in: tree cursor */
+#else /* UNIV_DEBUG */
+# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the buffer block on which the tree cursor is positioned.
+@return pointer to buffer block */
+UNIV_INLINE
+buf_block_t*
+btr_cur_get_block(
+/*==============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the record pointer of a tree cursor.
+@return pointer to record */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Invalidates a tree cursor by setting record pointer to NULL. */
+UNIV_INLINE
+void
+btr_cur_invalidate(
+/*===============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the index of a cursor.
+@return index */
+UNIV_INLINE
+dict_index_t*
+btr_cur_get_index(
+/*==============*/
+ btr_cur_t* cursor);/*!< in: B-tree cursor */
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record in tree */
+ buf_block_t* block, /*!< in: buffer block of rec */
+ btr_cur_t* cursor);/*!< in: cursor */
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will a have a sensible value. */
+UNIV_INTERN
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the tree level of search */
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be PAGE_CUR_LE,
+ not PAGE_CUR_GE, as the latter may end up on
+ the previous page of the record! Inserts
+ should always be made using PAGE_CUR_LE to
+ search the position! */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+ BTR_INSERT and BTR_ESTIMATE;
+ cursor->left_block is used to store a pointer
+ to the left neighbor page, in the cases
+ BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+ NOTE that if has_search_latch
+ is != 0, we maybe do not have a latch set
+ on the cursor page, we assume
+ the caller uses his search latch
+ to protect the record! */
+ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
+ s- or x-latched, but see also above! */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Opens a cursor at either end of an index. */
+UNIV_INTERN
+void
+btr_cur_open_at_index_side(
+/*=======================*/
+ ibool from_left, /*!< in: TRUE if open to the low end,
+ FALSE if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_cur_t* cursor, /*!< in: cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INTERN
+void
+btr_cur_open_at_rnd_pos(
+/*====================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
+UNIV_INTERN
+ulint
+btr_cur_optimistic_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr); /*!< in: mtr; if this function returns
+ DB_SUCCESS on a leaf page of a secondary
+ index in a compressed tablespace, the
+ mtr must be committed before latching
+ any further pages */
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_insert(
+/*=======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameter thr should be
+ specified; if no undo logging is specified,
+ then the caller must have reserved enough
+ free extents in the file space so that the
+ insertion will certainly succeed */
+ btr_cur_t* cursor, /*!< in: cursor after which to insert;
+ cursor stays valid */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+btr_cur_update_in_place(
+/*====================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in: mtr; must be committed before
+ latching any further pages */
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended.
+@return DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit,
+DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if
+there is not enough space left on the compressed page */
+UNIV_INTERN
+ulint
+btr_cur_optimistic_update(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ const upd_t* update, /*!< in: update vector; this must also
+ contain trx id and roll ptr fields */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in: mtr; must be committed before
+ latching any further pages */
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_update(
+/*=======================*/
+ ulint flags, /*!< in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or NULL */
+ const upd_t* update, /*!< in: update vector; this is allowed also
+ contain trx id and roll ptr fields, but
+ the values in update vector have no effect */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in: mtr; must be committed before
+ latching any further pages */
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+ulint
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor */
+ ibool val, /*!< in: value to set */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************//**
+Sets a secondary index record delete mark to TRUE or FALSE.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+ulint
+btr_cur_del_mark_set_sec_rec(
+/*=========================*/
+ ulint flags, /*!< in: locking flag */
+ btr_cur_t* cursor, /*!< in: cursor */
+ ibool val, /*!< in: value to set */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************//**
+Clear a secondary index record's delete mark. This function is only
+used by the insert buffer insert merge mechanism. */
+UNIV_INTERN
+void
+btr_cur_del_unmark_for_ibuf(
+/*========================*/
+ rec_t* rec, /*!< in/out: record to delete unmark */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page
+ corresponding to rec, or NULL
+ when the tablespace is
+ uncompressed */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to compress;
+ cursor does not stay valid if compression
+ occurs */
+ mtr_t* mtr); /*!< in: mtr */
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned. It is assumed
+that the mtr has an x-latch on the page where the cursor is positioned,
+but no latch on the whole tree.
+@return TRUE if success, i.e., the page did not become too empty */
+UNIV_INTERN
+ibool
+btr_cur_optimistic_delete(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ cursor stays valid: if deletion succeeds,
+ on function exit it points to the successor
+ of the deleted record */
+ mtr_t* mtr); /*!< in: mtr; if this function returns
+ TRUE on a leaf page of a secondary
+ index, the mtr must be committed
+ before latching any further pages */
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+ ulint* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+ the latter may occur because we may have
+ to update node pointers on upper levels,
+ and in the case of variable length keys
+ these may actually grow in size */
+ ibool has_reserved_extents, /*!< in: TRUE if the
+ caller has already reserved enough free
+ extents so that he knows that the operation
+ will succeed */
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ if compression does not occur, the cursor
+ stays valid: it points to successor of
+ deleted record on function exit */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index); /*!< in: index corresponding to page */
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a clustered
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_clust_rec(
+/*=================================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index); /*!< in: index corresponding to page */
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a secondary
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_sec_rec(
+/*===============================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page, or NULL */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Estimates the number of rows in a given index range.
+@return estimated number of rows */
+UNIV_INTERN
+ib_int64_t
+btr_estimate_n_rows_in_range(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple1, /*!< in: range start, may also be empty tuple */
+ ulint mode1, /*!< in: search mode for range start */
+ const dtuple_t* tuple2, /*!< in: range end, may also be empty tuple */
+ ulint mode2); /*!< in: search mode for range end */
+/*******************************************************************//**
+Estimates the number of different key values in a given index, for
+each n-column prefix of the index where n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals. */
+UNIV_INTERN
+void
+btr_estimate_number_of_different_key_vals(
+/*======================================*/
+ dict_index_t* index); /*!< in: index */
+/*******************************************************************//**
+Marks not updated extern fields as not-owned by this record. The ownership
+is transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of externally stored field is allowed
+to free the field. */
+UNIV_INTERN
+void
+btr_cur_mark_extern_inherited_fields(
+/*=================================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ mtr_t* mtr); /*!< in: mtr, or NULL if not logged */
+/*******************************************************************//**
+The complement of the previous function: in an update entry may inherit
+some externally stored fields from a record. We must mark them as inherited
+in entry, so that they are not freed in a rollback. */
+UNIV_INTERN
+void
+btr_cur_mark_dtuple_inherited_extern(
+/*=================================*/
+ dtuple_t* entry, /*!< in/out: updated entry to be
+ inserted to clustered index */
+ const upd_t* update); /*!< in: update vector */
+/*******************************************************************//**
+Marks all extern fields in a dtuple as owned by the record. */
+UNIV_INTERN
+void
+btr_cur_unmark_dtuple_extern_fields(
+/*================================*/
+ dtuple_t* entry); /*!< in/out: clustered index entry */
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ulint
+btr_store_big_rec_extern_fields(
+/*============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree
+ MUST be X-latched */
+ buf_block_t* rec_block, /*!< in/out: block containing rec */
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index);
+ the "external storage" flags in offsets
+ will not correspond to rec when
+ this function returns */
+ big_rec_t* big_rec_vec, /*!< in: vector containing fields
+ to be stored externally */
+ mtr_t* local_mtr); /*!< in: mtr containing the latch to
+ rec and to the tree */
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned the externally stored field,
+in a rollback we may have the additional condition that the field must
+not be inherited. */
+UNIV_INTERN
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched; if the tree
+ height is 1, then also the root page
+ must be X-latched! (this is relevant
+ in the case this function is called
+ from purge where 'data' is located on
+ an undo log page, not an index
+ page) */
+ byte* field_ref, /*!< in/out: field reference */
+ const rec_t* rec, /*!< in: record containing field_ref, for
+ page_zip_write_blob_ptr(), or NULL */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
+ or NULL */
+ page_zip_des_t* page_zip, /*!< in: compressed page corresponding
+ to rec, or NULL if rec == NULL */
+ ulint i, /*!< in: field number of field_ref;
+ ignored if rec == NULL */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* local_mtr); /*!< in: mtr containing the latch to
+ data an an X-latch to the index
+ tree */
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record. The
+clustered index record must be protected by a lock or a page latch.
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+UNIV_INTERN
+ulint
+btr_copy_externally_stored_field_prefix(
+/*====================================*/
+ byte* buf, /*!< out: the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part; must be protected by
+ a lock or a page latch */
+ ulint local_len);/*!< in: length of data, in bytes */
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap.
+@return the field copied to heap */
+UNIV_INTERN
+byte*
+btr_rec_copy_externally_stored_field(
+/*=================================*/
+ const rec_t* rec, /*!< in: record in a clustered index;
+ must be protected by a lock or a page latch */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ ulint no, /*!< in: field number */
+ ulint* len, /*!< out: length of the field */
+ mem_heap_t* heap); /*!< in: mem heap */
+/*******************************************************************//**
+Flags the data tuple fields that are marked as extern storage in the
+update vector. We use this function to remember which fields we must
+mark as extern storage in a record inserted for an update.
+@return number of flagged external columns */
+UNIV_INTERN
+ulint
+btr_push_update_extern_fields(
+/*==========================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const upd_t* update, /*!< in: update vector */
+ mem_heap_t* heap) /*!< in: memory heap */
+ __attribute__((nonnull));
+
+/*######################################################################*/
+
+/** In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+#define BTR_CUR_PAGE_COMPRESS_LIMIT (UNIV_PAGE_SIZE / 2)
+
+/** A slot in the path array. We store here info on a search path down the
+tree. Each slot contains data on a single level of the tree. */
+
+typedef struct btr_path_struct btr_path_t;
+struct btr_path_struct{
+ ulint nth_rec; /*!< index of the record
+ where the page cursor stopped on
+ this level (index in alphabetical
+ order); value ULINT_UNDEFINED
+ denotes array end */
+ ulint n_recs; /*!< number of records on the page */
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS 250 /*!< size of path array (in slots) */
+
+/** Values for the flag documenting the used search method */
+enum btr_cur_method {
+ BTR_CUR_HASH = 1, /*!< successful shortcut using
+ the hash index */
+ BTR_CUR_HASH_FAIL, /*!< failure using hash, success using
+ binary search: the misleading hash
+ reference is stored in the field
+ hash_node, and might be necessary to
+ update */
+ BTR_CUR_BINARY, /*!< success using the binary search */
+ BTR_CUR_INSERT_TO_IBUF, /*!< performed the intended insert to
+ the insert buffer */
+};
+
+/** The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+struct btr_cur_struct {
+ dict_index_t* index; /*!< index where positioned */
+ page_cur_t page_cur; /*!< page cursor */
+ buf_block_t* left_block; /*!< this field is used to store
+ a pointer to the left neighbor
+ page, in the cases
+ BTR_SEARCH_PREV and
+ BTR_MODIFY_PREV */
+ /*------------------------------*/
+ que_thr_t* thr; /*!< this field is only used
+ when btr_cur_search_to_nth_level
+ is called for an index entry
+ insertion: the calling query
+ thread is passed here to be
+ used in the insert buffer */
+ /*------------------------------*/
+ /** The following fields are used in
+ btr_cur_search_to_nth_level to pass information: */
+ /* @{ */
+ enum btr_cur_method flag; /*!< Search method used */
+ ulint tree_height; /*!< Tree height if the search is done
+ for a pessimistic insert or update
+ operation */
+ ulint up_match; /*!< If the search mode was PAGE_CUR_LE,
+ the number of matched fields to the
+ the first user record to the right of
+ the cursor record after
+ btr_cur_search_to_nth_level;
+ for the mode PAGE_CUR_GE, the matched
+ fields to the first user record AT THE
+ CURSOR or to the right of it;
+ NOTE that the up_match and low_match
+ values may exceed the correct values
+ for comparison to the adjacent user
+ record if that record is on a
+ different leaf page! (See the note in
+ row_ins_duplicate_key.) */
+ ulint up_bytes; /*!< number of matched bytes to the
+ right at the time cursor positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint low_match; /*!< if search mode was PAGE_CUR_LE,
+ the number of matched fields to the
+ first user record AT THE CURSOR or
+ to the left of it after
+ btr_cur_search_to_nth_level;
+ NOT defined for PAGE_CUR_GE or any
+ other search modes; see also the NOTE
+ in up_match! */
+ ulint low_bytes; /*!< number of matched bytes to the
+ right at the time cursor positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint n_fields; /*!< prefix length used in a hash
+ search if hash_node != NULL */
+ ulint n_bytes; /*!< hash prefix bytes if hash_node !=
+ NULL */
+ ulint fold; /*!< fold value used in the search if
+ flag is BTR_CUR_HASH */
+ /*------------------------------*/
+ /* @} */
+ btr_path_t* path_arr; /*!< in estimating the number of
+ rows in range, we store in this array
+ information of the path through
+ the tree */
+};
+
+/** If pessimistic delete fails because of lack of file space, there
+is still a good change of success a little later. Try this many
+times. */
+#define BTR_CUR_RETRY_DELETE_N_TIMES 100
+/** If pessimistic delete fails because of lack of file space, there
+is still a good change of success a little later. Sleep this many
+microseconds between retries. */
+#define BTR_CUR_RETRY_SLEEP_TIME 50000
+
+/** The reference in a field for which data is stored on a different page.
+The reference is at the end of the 'locally' stored part of the field.
+'Locally' means storage in the index record.
+We store locally a long enough prefix of each column so that we can determine
+the ordering parts of each index record without looking into the externally
+stored part. */
+/*-------------------------------------- @{ */
+#define BTR_EXTERN_SPACE_ID 0 /*!< space id where stored */
+#define BTR_EXTERN_PAGE_NO 4 /*!< page no where stored */
+#define BTR_EXTERN_OFFSET 8 /*!< offset of BLOB header
+ on that page */
+#define BTR_EXTERN_LEN 12 /*!< 8 bytes containing the
+ length of the externally
+ stored part of the BLOB.
+ The 2 highest bits are
+ reserved to the flags below. */
+/*-------------------------------------- @} */
+/* #define BTR_EXTERN_FIELD_REF_SIZE 20 // moved to btr0types.h */
+
+/** The most significant bit of BTR_EXTERN_LEN (i.e., the most
+significant bit of the byte at smallest address) is set to 1 if this
+field does not 'own' the externally stored field; only the owner field
+is allowed to free the field in purge! */
+#define BTR_EXTERN_OWNER_FLAG 128
+/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the
+second most significant bit of the byte at smallest address) is 1 then
+it means that the externally stored field was inherited from an
+earlier version of the row. In rollback we are not allowed to free an
+inherited external field. */
+#define BTR_EXTERN_INHERITED_FLAG 64
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+extern ulint btr_cur_n_non_sea;
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+extern ulint btr_cur_n_sea;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint btr_cur_n_non_sea_old;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint btr_cur_n_sea_old;
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "btr0cur.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/btr0cur.ic b/storage/innobase/include/btr0cur.ic
new file mode 100644
index 00000000000..280583f6ccf
--- /dev/null
+++ b/storage/innobase/include/btr0cur.ic
@@ -0,0 +1,200 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.ic
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "btr0btr.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the page cursor component of a tree cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+ const btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(&((btr_cur_t*) cursor)->page_cur);
+}
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the buffer block on which the tree cursor is positioned.
+@return pointer to buffer block */
+UNIV_INLINE
+buf_block_t*
+btr_cur_get_block(
+/*==============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_cur_get_block(btr_cur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the record pointer of a tree cursor.
+@return pointer to record */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_cur_get_rec(&(cursor->page_cur)));
+}
+
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(buf_block_get_page_zip(btr_cur_get_block(cursor)));
+}
+
+/*********************************************************//**
+Invalidates a tree cursor by setting record pointer to NULL. */
+UNIV_INLINE
+void
+btr_cur_invalidate(
+/*===============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ page_cur_invalidate(&(cursor->page_cur));
+}
+
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_align(page_cur_get_rec(&(cursor->page_cur))));
+}
+
+/*********************************************************//**
+Returns the index of a cursor.
+@return index */
+UNIV_INLINE
+dict_index_t*
+btr_cur_get_index(
+/*==============*/
+ btr_cur_t* cursor) /*!< in: B-tree cursor */
+{
+ return(cursor->index);
+}
+
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record in tree */
+ buf_block_t* block, /*!< in: buffer block of rec */
+ btr_cur_t* cursor) /*!< out: cursor */
+{
+ ut_ad(page_align(rec) == block->frame);
+
+ page_cur_position(rec, block, btr_cur_get_page_cur(cursor));
+
+ cursor->index = index;
+}
+
+/*********************************************************************//**
+Checks if compressing an index page where a btr cursor is placed makes
+sense.
+@return TRUE if compression is recommended */
+UNIV_INLINE
+ibool
+btr_cur_compress_recommendation(
+/*============================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ if ((page_get_data_size(page) < BTR_CUR_PAGE_COMPRESS_LIMIT)
+ || ((btr_page_get_next(page, mtr) == FIL_NULL)
+ && (btr_page_get_prev(page, mtr) == FIL_NULL))) {
+
+ /* The page fillfactor has dropped below a predefined
+ minimum value OR the level in the B-tree contains just
+ one page: we recommend compression if this is not the
+ root page. */
+
+ return(dict_index_get_page(cursor->index)
+ != page_get_page_no(page));
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if the record on which the cursor is placed can be deleted without
+making tree compression necessary (or, recommended).
+@return TRUE if can be deleted without recommended compression */
+UNIV_INLINE
+ibool
+btr_cur_can_delete_without_compress(
+/*================================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ ulint rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ if ((page_get_data_size(page) - rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT)
+ || ((btr_page_get_next(page, mtr) == FIL_NULL)
+ && (btr_page_get_prev(page, mtr) == FIL_NULL))
+ || (page_get_n_recs(page) < 2)) {
+
+ /* The page fillfactor will drop below a predefined
+ minimum value, OR the level in the B-tree contains just
+ one page, OR the page will become empty: we recommend
+ compression if this is not the root page. */
+
+ return(dict_index_get_page(cursor->index)
+ == page_get_page_no(page));
+ }
+
+ return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
new file mode 100644
index 00000000000..12b1375d8b7
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.h
@@ -0,0 +1,537 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.h
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0pcur_h
+#define btr0pcur_h
+
+#include "univ.i"
+#include "dict0dict.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+#include "page0cur.h"
+#include "btr0cur.h"
+#include "btr0btr.h"
+#include "btr0types.h"
+
+/* Relative positions for a stored cursor position */
+#define BTR_PCUR_ON 1
+#define BTR_PCUR_BEFORE 2
+#define BTR_PCUR_AFTER 3
+/* Note that if the tree is not empty, btr_pcur_store_position does not
+use the following, but only uses the above three alternatives, where the
+position is stored relative to a specific record: this makes implementation
+of a scroll cursor easier */
+#define BTR_PCUR_BEFORE_FIRST_IN_TREE 4 /* in an empty tree */
+#define BTR_PCUR_AFTER_LAST_IN_TREE 5 /* in an empty tree */
+
+/**************************************************************//**
+Allocates memory for a persistent cursor object and initializes the cursor.
+@return own: persistent cursor */
+UNIV_INTERN
+btr_pcur_t*
+btr_pcur_create_for_mysql(void);
+/*============================*/
+/**************************************************************//**
+Frees the memory for a persistent cursor object. */
+UNIV_INTERN
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+ btr_pcur_t* cursor); /*!< in, own: persistent cursor */
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+UNIV_INTERN
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+ btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the
+ position info */
+ btr_pcur_t* pcur_donate); /*!< in: pcur from which the info is
+ copied */
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur); /*!< in: persistent cursor */
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open(
+/*==========*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Opens an persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init(
+/*=======================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page of the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+ NOTE that if has_search_latch != 0 then
+ we maybe do not acquire a latch on the cursor
+ page, but assume that the caller uses his
+ btr search latch to protect the record! */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+ ibool from_left, /*!< in: TRUE if open to the low end,
+ FALSE if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_pcur_t* pcur, /*!< in: cursor */
+ ibool do_init, /*!< in: TRUE if should be initialized */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ btr_pcur_t* cursor); /*!< in: memory buffer for persistent cursor */
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ btr_pcur_t* cursor); /*!< in: memory buffer for persistent cursor */
+/**************************************************************//**
+If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
+user record satisfying the search condition, in the case PAGE_CUR_L or
+PAGE_CUR_LE, on the last user record. If no such user record exists, then
+in the first case sets the cursor after last in tree, and in the latter case
+before first in tree. The latching mode must be BTR_SEARCH_LEAF or
+BTR_MODIFY_LEAF. */
+UNIV_INTERN
+void
+btr_pcur_open_on_user_rec(
+/*======================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent
+ cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Frees the possible old_rec_buf buffer of a persistent cursor and sets the
+latch mode of the persistent cursor to BTR_NO_LATCHES. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor id before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+UNIV_INTERN
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Restores the stored position of a persistent cursor bufferfixing the page and
+obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
+@return TRUE if the cursor position was stored when it was on a user
+record and it can be restored on a user record whose ordering fields
+are identical to the ones of the original user record */
+UNIV_INTERN
+ibool
+btr_pcur_restore_position(
+/*======================*/
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+If the latch mode of the cursor is BTR_LEAF_SEARCH or BTR_LEAF_MODIFY,
+releases the page latch and bufferfix reserved by the cursor.
+NOTE! In the case of BTR_LEAF_MODIFY, there should not exist changes
+made by the current mini-transaction to the data protected by the
+cursor latch, as then the latch must not be released until mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_release_leaf(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Sets the mtr field for a pcur. */
+UNIV_INLINE
+void
+btr_pcur_set_mtr(
+/*=============*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in, own: mtr */
+/*********************************************************//**
+Gets the mtr field for a pcur.
+@return mtr */
+UNIV_INLINE
+mtr_t*
+btr_pcur_get_mtr(
+/*=============*/
+ btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+Commits the pcur mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached. If there have been modifications
+to the page where pcur is positioned, this can be used instead of
+btr_pcur_release_leaf. Function btr_pcur_store_position should be used
+before calling this, if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit(
+/*============*/
+ btr_pcur_t* pcur); /*!< in: persistent cursor */
+/**************************************************************//**
+Differs from btr_pcur_commit in that we can specify the mtr to commit. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr to commit */
+/**************************************************************//**
+Tests if a cursor is detached: that is the latch mode is BTR_NO_LATCHES.
+@return TRUE if detached */
+UNIV_INLINE
+ibool
+btr_pcur_is_detached(
+/*=================*/
+ btr_pcur_t* pcur); /*!< in: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return TRUE if the cursor was not before first in tree */
+UNIV_INTERN
+ibool
+btr_pcur_move_to_prev(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the last record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_last_on_page(
+/*==========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page.
+Releases the latch on the current page, and bufferunfixes it.
+Note that there must not be modifications on the current page,
+as then the x-latch can be released only in mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_move_to_next_page(
+/*=======================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
+ last record of the current page */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor backward if it is on the first record
+of the page. Releases the latch on the current page, and bufferunfixes
+it. Note that to prevent a possible deadlock, the operation first
+stores the position of the cursor, releases the leaf latch, acquires
+necessary latches and restores the cursor position again before returning.
+The alphabetical position of the cursor is guaranteed to be sensible
+on return, but it may happen that the cursor is not positioned on the
+last record of any page, because the structure of the tree may have
+changed while the cursor had no latches. */
+UNIV_INTERN
+void
+btr_pcur_move_backward_from_page(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the
+ first record of the current page */
+ mtr_t* mtr); /*!< in: mtr */
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the btr cursor component of a persistent cursor.
+@return pointer to btr cursor component */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/*********************************************************//**
+Returns the page cursor component of a persistent cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+#else /* UNIV_DEBUG */
+# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the page of a persistent cursor.
+@return pointer to the page */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+ btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the buffer block of a persistent cursor.
+@return pointer to the block */
+UNIV_INLINE
+buf_block_t*
+btr_pcur_get_block(
+/*===============*/
+ btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the record of a persistent cursor.
+@return pointer to the record */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+ btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_in_tree(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_in_tree(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+
+
+/* The persistent B-tree cursor structure. This is used mainly for SQL
+selects, updates, and deletes. */
+
+struct btr_pcur_struct{
+ btr_cur_t btr_cur; /*!< a B-tree cursor */
+ ulint latch_mode; /*!< see TODO note below!
+ BTR_SEARCH_LEAF, BTR_MODIFY_LEAF,
+ BTR_MODIFY_TREE, or BTR_NO_LATCHES,
+ depending on the latching state of
+ the page and tree where the cursor is
+ positioned; the last value means that
+ the cursor is not currently positioned:
+ we say then that the cursor is
+ detached; it can be restored to
+ attached if the old position was
+ stored in old_rec */
+ ulint old_stored; /*!< BTR_PCUR_OLD_STORED
+ or BTR_PCUR_OLD_NOT_STORED */
+ rec_t* old_rec; /*!< if cursor position is stored,
+ contains an initial segment of the
+ latest record cursor was positioned
+ either on, before, or after */
+ ulint old_n_fields; /*!< number of fields in old_rec */
+ ulint rel_pos; /*!< BTR_PCUR_ON, BTR_PCUR_BEFORE, or
+ BTR_PCUR_AFTER, depending on whether
+ cursor was on, before, or after the
+ old_rec record */
+ buf_block_t* block_when_stored;/* buffer block when the position was
+ stored */
+ ib_uint64_t modify_clock; /*!< the modify clock value of the
+ buffer block when the cursor position
+ was stored */
+ ulint pos_state; /*!< see TODO note below!
+ BTR_PCUR_IS_POSITIONED,
+ BTR_PCUR_WAS_POSITIONED,
+ BTR_PCUR_NOT_POSITIONED */
+ ulint search_mode; /*!< PAGE_CUR_G, ... */
+ trx_t* trx_if_known; /*!< the transaction, if we know it;
+ otherwise this field is not defined;
+ can ONLY BE USED in error prints in
+ fatal assertion failures! */
+ /*-----------------------------*/
+ /* NOTE that the following fields may possess dynamically allocated
+ memory which should be freed if not needed anymore! */
+
+ mtr_t* mtr; /*!< NULL, or this field may contain
+ a mini-transaction which holds the
+ latch on the cursor page */
+ byte* old_rec_buf; /*!< NULL, or a dynamically allocated
+ buffer for old_rec */
+ ulint buf_size; /*!< old_rec_buf size if old_rec_buf
+ is not NULL */
+};
+
+#define BTR_PCUR_IS_POSITIONED 1997660512 /* TODO: currently, the state
+ can be BTR_PCUR_IS_POSITIONED,
+ though it really should be
+ BTR_PCUR_WAS_POSITIONED,
+ because we have no obligation
+ to commit the cursor with
+ mtr; similarly latch_mode may
+ be out of date. This can
+ lead to problems if btr_pcur
+ is not used the right way;
+ all current code should be
+ ok. */
+#define BTR_PCUR_WAS_POSITIONED 1187549791
+#define BTR_PCUR_NOT_POSITIONED 1328997689
+
+#define BTR_PCUR_OLD_STORED 908467085
+#define BTR_PCUR_OLD_NOT_STORED 122766467
+
+#ifndef UNIV_NONINL
+#include "btr0pcur.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/btr0pcur.ic b/storage/innobase/include/btr0pcur.ic
new file mode 100644
index 00000000000..0ca7223f861
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.ic
@@ -0,0 +1,651 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.ic
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor);
+ ut_ad(cursor->old_rec);
+ ut_ad(cursor->old_stored == BTR_PCUR_OLD_STORED);
+ ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED
+ || cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(cursor->rel_pos);
+}
+
+/*********************************************************//**
+Sets the mtr field for a pcur. */
+UNIV_INLINE
+void
+btr_pcur_set_mtr(
+/*=============*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in, own: mtr */
+{
+ ut_ad(cursor);
+
+ cursor->mtr = mtr;
+}
+
+/*********************************************************//**
+Gets the mtr field for a pcur.
+@return mtr */
+UNIV_INLINE
+mtr_t*
+btr_pcur_get_mtr(
+/*=============*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor);
+
+ return(cursor->mtr);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the btr cursor component of a persistent cursor.
+@return pointer to btr cursor component */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ const btr_cur_t* btr_cur = &cursor->btr_cur;
+ return((btr_cur_t*) btr_cur);
+}
+
+/*********************************************************//**
+Returns the page cursor component of a persistent cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ return(btr_cur_get_page_cur(btr_pcur_get_btr_cur(cursor)));
+}
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the page of a persistent cursor.
+@return pointer to the page */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(btr_cur_get_page(btr_pcur_get_btr_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the buffer block of a persistent cursor.
+@return pointer to the block */
+UNIV_INLINE
+buf_block_t*
+btr_pcur_get_block(
+/*===============*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(btr_cur_get_block(btr_pcur_get_btr_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the record of a persistent cursor.
+@return pointer to the record */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(btr_cur_get_rec(btr_pcur_get_btr_cur(cursor)));
+}
+
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ btr_pcur_t* cursor) /*!< in: memory buffer for persistent cursor */
+{
+ btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ ut_ad(btr_cursor->up_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->up_match);
+}
+
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ btr_pcur_t* cursor) /*!< in: memory buffer for persistent cursor */
+{
+ btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+ ut_ad(btr_cursor->low_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->low_match);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_pcur_is_before_first_on_page(cursor)
+ || btr_pcur_is_after_last_on_page(cursor)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_in_tree(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_page_get_prev(btr_pcur_get_page(cursor), mtr) != FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_in_tree(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_page_get_next(btr_pcur_get_page(cursor), mtr) != FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the last record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_last_on_page(
+/*==========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ UT_NOT_USED(mtr);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_set_after_last(btr_pcur_get_block(cursor),
+ btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+loop:
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+
+ if (btr_pcur_is_after_last_in_tree(cursor, mtr)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_to_next_page(cursor, mtr);
+ } else {
+ btr_pcur_move_to_next_on_page(cursor);
+ }
+
+ if (btr_pcur_is_on_user_rec(cursor)) {
+
+ return(TRUE);
+ }
+
+ goto loop;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+
+ if (btr_pcur_is_after_last_in_tree(cursor, mtr)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_to_next_page(cursor, mtr);
+
+ return(TRUE);
+ }
+
+ btr_pcur_move_to_next_on_page(cursor);
+
+ return(TRUE);
+}
+
+/**************************************************************//**
+Commits the pcur mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached. If there have been modifications
+to the page where pcur is positioned, this can be used instead of
+btr_pcur_release_leaf. Function btr_pcur_store_position should be used
+before calling this, if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit(
+/*============*/
+ btr_pcur_t* pcur) /*!< in: persistent cursor */
+{
+ ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ mtr_commit(pcur->mtr);
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Differs from btr_pcur_commit in that we can specify the mtr to commit. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr to commit */
+{
+ ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ mtr_commit(mtr);
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Sets the pcur latch mode to BTR_NO_LATCHES. */
+UNIV_INLINE
+void
+btr_pcur_detach(
+/*============*/
+ btr_pcur_t* pcur) /*!< in: persistent cursor */
+{
+ ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Tests if a cursor is detached: that is the latch mode is BTR_NO_LATCHES.
+@return TRUE if detached */
+UNIV_INLINE
+ibool
+btr_pcur_is_detached(
+/*=================*/
+ btr_pcur_t* pcur) /*!< in: persistent cursor */
+{
+ if (pcur->latch_mode == BTR_NO_LATCHES) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur) /*!< in: persistent cursor */
+{
+ pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+ pcur->old_rec_buf = NULL;
+ pcur->old_rec = NULL;
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open(
+/*==========*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t* btr_cursor;
+
+ /* Initialize the cursor */
+
+ btr_pcur_init(cursor);
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+ btr_cursor, 0, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ cursor->trx_if_known = NULL;
+}
+
+/**************************************************************//**
+Opens an persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init(
+/*=======================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page of the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+ NOTE that if has_search_latch != 0 then
+ we maybe do not acquire a latch on the cursor
+ page, but assume that the caller uses his
+ btr search latch to protect the record! */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t* btr_cursor;
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+ btr_cursor, has_search_latch, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->trx_if_known = NULL;
+}
+
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+ ibool from_left, /*!< in: TRUE if open to the low end,
+ FALSE if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_pcur_t* pcur, /*!< in: cursor */
+ ibool do_init, /*!< in: TRUE if should be initialized */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ pcur->latch_mode = latch_mode;
+
+ if (from_left) {
+ pcur->search_mode = PAGE_CUR_G;
+ } else {
+ pcur->search_mode = PAGE_CUR_L;
+ }
+
+ if (do_init) {
+ btr_pcur_init(pcur);
+ }
+
+ btr_cur_open_at_index_side(from_left, index, latch_mode,
+ btr_pcur_get_btr_cur(pcur), mtr);
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ pcur->trx_if_known = NULL;
+}
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ /* Initialize the cursor */
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = PAGE_CUR_G;
+
+ btr_pcur_init(cursor);
+
+ btr_cur_open_at_rnd_pos(index, latch_mode,
+ btr_pcur_get_btr_cur(cursor), mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->trx_if_known = NULL;
+}
+
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ if (cursor->old_rec_buf != NULL) {
+
+ mem_free(cursor->old_rec_buf);
+
+ cursor->old_rec = NULL;
+ cursor->old_rec_buf = NULL;
+ }
+
+ cursor->btr_cur.page_cur.rec = NULL;
+ cursor->btr_cur.page_cur.block = NULL;
+ cursor->old_rec = NULL;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+
+ cursor->trx_if_known = NULL;
+}
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
new file mode 100644
index 00000000000..631b3bd386c
--- /dev/null
+++ b/storage/innobase/include/btr0sea.h
@@ -0,0 +1,304 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.h
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0sea_h
+#define btr0sea_h
+
+#include "univ.i"
+
+#include "rem0rec.h"
+#include "dict0dict.h"
+#include "btr0types.h"
+#include "mtr0mtr.h"
+#include "ha0ha.h"
+
+/*****************************************************************//**
+Creates and initializes the adaptive search system at a database start. */
+UNIV_INTERN
+void
+btr_search_sys_create(
+/*==================*/
+ ulint hash_size); /*!< in: hash index hash table size */
+
+/********************************************************************//**
+Disable the adaptive hash search system and empty the index. */
+UNIV_INTERN
+void
+btr_search_disable(void);
+/*====================*/
+/********************************************************************//**
+Enable the adaptive hash search system. */
+UNIV_INTERN
+void
+btr_search_enable(void);
+/*====================*/
+
+/********************************************************************//**
+Returns search info for an index.
+@return search info; search mutex reserved */
+UNIV_INLINE
+btr_search_t*
+btr_search_get_info(
+/*================*/
+ dict_index_t* index); /*!< in: index */
+/*****************************************************************//**
+Creates and initializes a search info struct.
+@return own: search info struct */
+UNIV_INTERN
+btr_search_t*
+btr_search_info_create(
+/*===================*/
+ mem_heap_t* heap); /*!< in: heap where created */
+/*****************************************************************//**
+Returns the value of ref_count. The value is protected by
+btr_search_latch.
+@return ref_count value. */
+UNIV_INTERN
+ulint
+btr_search_info_get_ref_count(
+/*==========================*/
+ btr_search_t* info); /*!< in: search info. */
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /*!< in: index of the cursor */
+ btr_cur_t* cursor);/*!< in: cursor which was just positioned */
+/******************************************************************//**
+Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+btr_search_guess_on_hash(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ btr_search_t* info, /*!< in: index search info */
+ const dtuple_t* tuple, /*!< in: logical record */
+ ulint mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< out: tree cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, RW_X_LATCH, or 0 */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+Moves or deletes hash entries for moved records. If new_page is already hashed,
+then the hash index for page, if any, is dropped. If new_page is not hashed,
+and page is hashed, then a new hash index is built to new_page with the same
+parameters as page (this often happens when a page is split). */
+UNIV_INTERN
+void
+btr_search_move_or_delete_hash_entries(
+/*===================================*/
+ buf_block_t* new_block, /*!< in: records are copied
+ to this page */
+ buf_block_t* block, /*!< in: index page from which
+ records were copied, and the
+ copied records will be deleted
+ from this page */
+ dict_index_t* index); /*!< in: record descriptor */
+/********************************************************************//**
+Drops a page hash index. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index(
+/*============================*/
+ buf_block_t* block); /*!< in: block containing index page,
+ s- or x-latched, or an index page
+ for which we know that
+ block->buf_fix_count == 0 */
+/********************************************************************//**
+Drops a page hash index when a page is freed from a fseg to the file system.
+Drops possible hash index if the page happens to be in the buffer pool. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_when_freed(
+/*=================================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no); /*!< in: page number */
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_node_on_insert(
+/*==================================*/
+ btr_cur_t* cursor);/*!< in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_insert(
+/*=============================*/
+ btr_cur_t* cursor);/*!< in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+/********************************************************************//**
+Updates the page hash index when a single record is deleted from a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_delete(
+/*=============================*/
+ btr_cur_t* cursor);/*!< in: cursor which was positioned on the
+ record to delete using btr_cur_search_...,
+ the record is not yet deleted */
+/********************************************************************//**
+Validates the search system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_search_validate(void);
+/*======================*/
+
+/** Flag: has the search system been enabled?
+Protected by btr_search_latch and btr_search_enabled_mutex. */
+extern char btr_search_enabled;
+
+/** The search info struct in an index */
+struct btr_search_struct{
+ ulint ref_count; /*!< Number of blocks in this index tree
+ that have search index built
+ i.e. block->index points to this index.
+ Protected by btr_search_latch except
+ when during initialization in
+ btr_search_info_create(). */
+
+ /* @{ The following fields are not protected by any latch.
+ Unfortunately, this means that they must be aligned to
+ the machine word, i.e., they cannot be turned into bit-fields. */
+ buf_block_t* root_guess;/*!< the root page frame when it was last time
+ fetched, or NULL */
+ ulint hash_analysis; /*!< when this exceeds
+ BTR_SEARCH_HASH_ANALYSIS, the hash
+ analysis starts; this is reset if no
+ success noticed */
+ ibool last_hash_succ; /*!< TRUE if the last search would have
+ succeeded, or did succeed, using the hash
+ index; NOTE that the value here is not exact:
+ it is not calculated for every search, and the
+ calculation itself is not always accurate! */
+ ulint n_hash_potential;
+ /*!< number of consecutive searches
+ which would have succeeded, or did succeed,
+ using the hash index;
+ the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */
+ /* @} */
+ /*---------------------- @{ */
+ ulint n_fields; /*!< recommended prefix length for hash search:
+ number of full fields */
+ ulint n_bytes; /*!< recommended prefix: number of bytes in
+ an incomplete field
+ @see BTR_PAGE_MAX_REC_SIZE */
+ ibool left_side; /*!< TRUE or FALSE, depending on whether
+ the leftmost record of several records with
+ the same prefix should be indexed in the
+ hash index */
+ /*---------------------- @} */
+#ifdef UNIV_SEARCH_PERF_STAT
+ ulint n_hash_succ; /*!< number of successful hash searches thus
+ far */
+ ulint n_hash_fail; /*!< number of failed hash searches */
+ ulint n_patt_succ; /*!< number of successful pattern searches thus
+ far */
+ ulint n_searches; /*!< number of searches */
+#endif /* UNIV_SEARCH_PERF_STAT */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< magic number @see BTR_SEARCH_MAGIC_N */
+/** value of btr_search_struct::magic_n, used in assertions */
+# define BTR_SEARCH_MAGIC_N 1112765
+#endif /* UNIV_DEBUG */
+};
+
+/** The hash index system */
+typedef struct btr_search_sys_struct btr_search_sys_t;
+
+/** The hash index system */
+struct btr_search_sys_struct{
+ hash_table_t* hash_index; /*!< the adaptive hash index,
+ mapping dtuple_fold values
+ to rec_t pointers on index pages */
+};
+
+/** The adaptive hash index */
+extern btr_search_sys_t* btr_search_sys;
+
+/** @brief The latch protecting the adaptive search system
+
+This latch protects the
+(1) hash index;
+(2) columns of a record to which we have a pointer in the hash index;
+
+but does NOT protect:
+
+(3) next record offset field in a record;
+(4) next or previous records on the same page.
+
+Bear in mind (3) and (4) when using the hash index.
+*/
+extern rw_lock_t* btr_search_latch_temp;
+
+/** The latch protecting the adaptive search system */
+#define btr_search_latch (*btr_search_latch_temp)
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+extern ulint btr_search_n_succ;
+/** Number of failed adaptive hash index lookups */
+extern ulint btr_search_n_hash_fail;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** After change in n_fields or n_bytes in info, this many rounds are waited
+before starting the hash analysis again: this is to save CPU time when there
+is no hope in building a hash index. */
+#define BTR_SEARCH_HASH_ANALYSIS 17
+
+/** Limit of consecutive searches for trying a search shortcut on the search
+pattern */
+#define BTR_SEARCH_ON_PATTERN_LIMIT 3
+
+/** Limit of consecutive searches for trying a search shortcut using
+the hash index */
+#define BTR_SEARCH_ON_HASH_LIMIT 3
+
+/** We do this many searches before trying to keep the search latch
+over calls from MySQL. If we notice someone waiting for the latch, we
+again set this much timeout. This is to reduce contention. */
+#define BTR_SEA_TIMEOUT 10000
+
+#ifndef UNIV_NONINL
+#include "btr0sea.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/btr0sea.ic b/storage/innobase/include/btr0sea.ic
new file mode 100644
index 00000000000..beadeeb8d02
--- /dev/null
+++ b/storage/innobase/include/btr0sea.ic
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.ic
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "dict0mem.h"
+#include "btr0cur.h"
+#include "buf0buf.h"
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INTERN
+void
+btr_search_info_update_slow(
+/*========================*/
+ btr_search_t* info, /*!< in/out: search info */
+ btr_cur_t* cursor);/*!< in: cursor which was just positioned */
+
+/********************************************************************//**
+Returns search info for an index.
+@return search info; search mutex reserved */
+UNIV_INLINE
+btr_search_t*
+btr_search_get_info(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+
+ return(index->search_info);
+}
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /*!< in: index of the cursor */
+ btr_cur_t* cursor) /*!< in: cursor which was just positioned */
+{
+ btr_search_t* info;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ info = btr_search_get_info(index);
+
+ info->hash_analysis++;
+
+ if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) {
+
+ /* Do nothing */
+
+ return;
+
+ }
+
+ ut_ad(cursor->flag != BTR_CUR_HASH);
+
+ btr_search_info_update_slow(info, cursor);
+}
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
new file mode 100644
index 00000000000..ef4a6b04b34
--- /dev/null
+++ b/storage/innobase/include/btr0types.h
@@ -0,0 +1,51 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0types.h
+The index tree general types
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0types_h
+#define btr0types_h
+
+#include "univ.i"
+
+#include "rem0types.h"
+#include "page0types.h"
+
+/** Persistent cursor */
+typedef struct btr_pcur_struct btr_pcur_t;
+/** B-tree cursor */
+typedef struct btr_cur_struct btr_cur_t;
+/** B-tree search information for the adaptive hash index */
+typedef struct btr_search_struct btr_search_t;
+
+/** The size of a reference to data stored on a different page.
+The reference is stored at the end of the prefix of the field
+in the index record. */
+#define BTR_EXTERN_FIELD_REF_SIZE 20
+
+/** A BLOB field reference full of zero, for use in assertions and tests.
+Initially, BLOB field references are set to zero, in
+dtuple_convert_big_rec(). */
+extern const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];
+
+#endif
diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h
new file mode 100644
index 00000000000..7648950d5d1
--- /dev/null
+++ b/storage/innobase/include/buf0buddy.h
@@ -0,0 +1,90 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.h
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifndef buf0buddy_h
+#define buf0buddy_h
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "univ.i"
+#include "buf0types.h"
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool_mutex and must not hold buf_pool_zip_mutex or any
+block->mutex. The buf_pool_mutex may only be released and reacquired
+if lru != NULL. This function should only be used for allocating
+compressed page frames or control blocks (buf_page_t). Allocated
+control blocks must be properly initialized immediately after
+buf_buddy_alloc() has returned the memory, before releasing
+buf_pool_mutex.
+@return allocated block, possibly NULL if lru == NULL */
+UNIV_INLINE
+void*
+buf_buddy_alloc(
+/*============*/
+ ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */
+ ibool* lru) /*!< in: pointer to a variable that will be assigned
+ TRUE if storage was allocated from the LRU list
+ and buf_pool_mutex was temporarily released,
+ or NULL if the LRU list should not be used */
+ __attribute__((malloc));
+
+/**********************************************************************//**
+Release a block. */
+UNIV_INLINE
+void
+buf_buddy_free(
+/*===========*/
+ void* buf, /*!< in: block to be freed, must not be
+ pointed to by the buffer pool */
+ ulint size) /*!< in: block size, up to UNIV_PAGE_SIZE */
+ __attribute__((nonnull));
+
+/** Statistics of buddy blocks of a given size. */
+struct buf_buddy_stat_struct {
+ /** Number of blocks allocated from the buddy system. */
+ ulint used;
+ /** Number of blocks relocated by the buddy system. */
+ ib_uint64_t relocated;
+ /** Total duration of block relocations, in microseconds. */
+ ib_uint64_t relocated_usec;
+};
+
+/** Statistics of buddy blocks of a given size. */
+typedef struct buf_buddy_stat_struct buf_buddy_stat_t;
+
+/** Statistics of the buddy system, indexed by block size.
+Protected by buf_pool_mutex. */
+extern buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES + 1];
+
+#ifndef UNIV_NONINL
+# include "buf0buddy.ic"
+#endif
+
+#endif /* buf0buddy_h */
diff --git a/storage/innobase/include/buf0buddy.ic b/storage/innobase/include/buf0buddy.ic
new file mode 100644
index 00000000000..c419a2374d9
--- /dev/null
+++ b/storage/innobase/include/buf0buddy.ic
@@ -0,0 +1,127 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.ic
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "buf0buf.h"
+#include "buf0buddy.h"
+#include "ut0ut.h"
+#include "sync0sync.h"
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex.
+The buf_pool_mutex may only be released and reacquired if lru != NULL.
+@return allocated block, possibly NULL if lru==NULL */
+UNIV_INTERN
+void*
+buf_buddy_alloc_low(
+/*================*/
+ ulint i, /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+ ibool* lru) /*!< in: pointer to a variable that will be assigned
+ TRUE if storage was allocated from the LRU list
+ and buf_pool_mutex was temporarily released,
+ or NULL if the LRU list should not be used */
+ __attribute__((malloc));
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INTERN
+void
+buf_buddy_free_low(
+/*===============*/
+ void* buf, /*!< in: block to be freed, must not be
+ pointed to by the buffer pool */
+ ulint i) /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Get the index of buf_pool->zip_free[] for a given block size.
+@return index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */
+UNIV_INLINE
+ulint
+buf_buddy_get_slot(
+/*===============*/
+ ulint size) /*!< in: block size */
+{
+ ulint i;
+ ulint s;
+
+ for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
+ }
+
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ return(i);
+}
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool_mutex and must not hold buf_pool_zip_mutex or any
+block->mutex. The buf_pool_mutex may only be released and reacquired
+if lru != NULL. This function should only be used for allocating
+compressed page frames or control blocks (buf_page_t). Allocated
+control blocks must be properly initialized immediately after
+buf_buddy_alloc() has returned the memory, before releasing
+buf_pool_mutex.
+@return allocated block, possibly NULL if lru == NULL */
+UNIV_INLINE
+void*
+buf_buddy_alloc(
+/*============*/
+ ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */
+ ibool* lru) /*!< in: pointer to a variable that will be assigned
+ TRUE if storage was allocated from the LRU list
+ and buf_pool_mutex was temporarily released,
+ or NULL if the LRU list should not be used */
+{
+ ut_ad(buf_pool_mutex_own());
+
+ return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru));
+}
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INLINE
+void
+buf_buddy_free(
+/*===========*/
+ void* buf, /*!< in: block to be freed, must not be
+ pointed to by the buffer pool */
+ ulint size) /*!< in: block size, up to UNIV_PAGE_SIZE */
+{
+ ut_ad(buf_pool_mutex_own());
+
+ buf_buddy_free_low(buf, buf_buddy_get_slot(size));
+}
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
new file mode 100644
index 00000000000..65ad42c895a
--- /dev/null
+++ b/storage/innobase/include/buf0buf.h
@@ -0,0 +1,1531 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.h
+The database buffer pool high-level routines
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0buf_h
+#define buf0buf_h
+
+#include "univ.i"
+#include "fil0fil.h"
+#include "mtr0types.h"
+#include "buf0types.h"
+#include "hash0hash.h"
+#include "ut0byte.h"
+#include "page0types.h"
+#ifndef UNIV_HOTBACKUP
+#include "os0proc.h"
+
+/** @name Modes for buf_page_get_gen */
+/* @{ */
+#define BUF_GET 10 /*!< get always */
+#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */
+#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but
+ set no latch; we have
+ separated this case, because
+ it is error-prone programming
+ not to set a latch, and it
+ should be used with care */
+/* @} */
+/** @name Modes for buf_page_get_known_nowait */
+/* @{ */
+#define BUF_MAKE_YOUNG 51 /*!< Move the block to the
+ start of the LRU list if there
+ is a danger that the block
+ would drift out of the buffer
+ pool*/
+#define BUF_KEEP_OLD 52 /*!< Preserve the current LRU
+ position of the block. */
+/* @} */
+
+extern buf_pool_t* buf_pool; /*!< The buffer pool of the database */
+#ifdef UNIV_DEBUG
+extern ibool buf_debug_prints;/*!< If this is set TRUE, the program
+ prints info whenever read or flush
+ occurs */
+#endif /* UNIV_DEBUG */
+extern ulint srv_buf_pool_write_requests; /*!< variable to count write request
+ issued */
+#else /* !UNIV_HOTBACKUP */
+extern buf_block_t* back_block1; /*!< first block, for --apply-log */
+extern buf_block_t* back_block2; /*!< second block, for page reorganize */
+#endif /* !UNIV_HOTBACKUP */
+
+/** Magic value to use instead of checksums when they are disabled */
+#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
+
+/** @brief States of a control block
+@see buf_page_struct
+
+The enumeration values must be 0..7. */
+enum buf_page_state {
+ BUF_BLOCK_ZIP_FREE = 0, /*!< contains a free
+ compressed page */
+ BUF_BLOCK_ZIP_PAGE, /*!< contains a clean
+ compressed page */
+ BUF_BLOCK_ZIP_DIRTY, /*!< contains a compressed
+ page that is in the
+ buf_pool->flush_list */
+
+ BUF_BLOCK_NOT_USED, /*!< is in the free list;
+ must be after the BUF_BLOCK_ZIP_
+ constants for compressed-only pages
+ @see buf_block_state_valid() */
+ BUF_BLOCK_READY_FOR_USE, /*!< when buf_LRU_get_free_block
+ returns a block, it is in this state */
+ BUF_BLOCK_FILE_PAGE, /*!< contains a buffered file page */
+ BUF_BLOCK_MEMORY, /*!< contains some main memory
+ object */
+ BUF_BLOCK_REMOVE_HASH /*!< hash index should be removed
+ before putting to the free list */
+};
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Creates the buffer pool.
+@return own: buf_pool object, NULL if not enough memory or error */
+UNIV_INTERN
+buf_pool_t*
+buf_pool_init(void);
+/*===============*/
+/********************************************************************//**
+Frees the buffer pool at shutdown. This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(void);
+/*===============*/
+
+/********************************************************************//**
+Drops the adaptive hash index. To prevent a livelock, this function
+is only to be called while holding btr_search_latch and while
+btr_search_enabled == FALSE. */
+UNIV_INTERN
+void
+buf_pool_drop_hash_index(void);
+/*==========================*/
+
+/********************************************************************//**
+Relocate a buffer control block. Relocates the block on the LRU list
+and in buf_pool->page_hash. Does not relocate bpage->list.
+The caller must take care of relocating bpage->list. */
+UNIV_INTERN
+void
+buf_relocate(
+/*=========*/
+ buf_page_t* bpage, /*!< in/out: control block being relocated;
+ buf_page_get_state(bpage) must be
+ BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
+ buf_page_t* dpage) /*!< in/out: destination control block */
+ __attribute__((nonnull));
+/********************************************************************//**
+Resizes the buffer pool. */
+UNIV_INTERN
+void
+buf_pool_resize(void);
+/*=================*/
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void);
+/*========================*/
+/********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return oldest modification in pool, zero if none */
+UNIV_INLINE
+ib_uint64_t
+buf_pool_get_oldest_modification(void);
+/*==================================*/
+/********************************************************************//**
+Allocates a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INLINE
+buf_block_t*
+buf_block_alloc(
+/*============*/
+ ulint zip_size); /*!< in: compressed page size in bytes,
+ or 0 if uncompressed tablespace */
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+ buf_block_t* block); /*!< in, own: block to be freed */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Copies contents of a buffer frame to a given buffer.
+@return buf */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+ byte* buf, /*!< in: buffer to copy to */
+ const buf_frame_t* frame); /*!< in: buffer frame */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+NOTE! The following macros should be used instead of buf_page_get_gen,
+to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed
+in LA! */
+#define buf_page_get(SP, ZS, OF, LA, MTR) buf_page_get_gen(\
+ SP, ZS, OF, LA, NULL,\
+ BUF_GET, __FILE__, __LINE__, MTR)
+/**************************************************************//**
+Use these macros to bufferfix a page with no latching. Remember not to
+read the contents of the page unless you know it is safe. Do not modify
+the contents of the page! We have separated this case, because it is
+error-prone programming not to set a latch, and it should be used
+with care. */
+#define buf_page_get_with_no_latch(SP, ZS, OF, MTR) buf_page_get_gen(\
+ SP, ZS, OF, RW_NO_LATCH, NULL,\
+ BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR)
+/**************************************************************//**
+NOTE! The following macros should be used instead of
+buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and
+RW_X_LATCH are allowed as LA! */
+#define buf_page_optimistic_get(LA, BL, MC, MTR) \
+ buf_page_optimistic_get_func(LA, BL, MC, __FILE__, __LINE__, MTR)
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_optimistic_get_func(
+/*=========================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: guessed block */
+ ib_uint64_t modify_clock,/*!< in: modify clock value if mode is
+ ..._GUESS_ON_CLOCK */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+/********************************************************************//**
+This is used to get access to a known database page, when no waiting can be
+done.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_get_known_nowait(
+/*======================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: the known page */
+ ulint mode, /*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/*******************************************************************//**
+Given a tablespace id and page number tries to get that page. If the
+page is not in the buffer pool it is not loaded and NULL is returned.
+Suitable for using when holding the kernel mutex. */
+UNIV_INTERN
+const buf_block_t*
+buf_page_try_get_func(
+/*==================*/
+ ulint space_id,/*!< in: tablespace id */
+ ulint page_no,/*!< in: page number */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/** Tries to get a page. If the page is not in the buffer pool it is
+not loaded. Suitable for using when holding the kernel mutex.
+@param space_id in: tablespace id
+@param page_no in: page number
+@param mtr in: mini-transaction
+@return the page if in buffer pool, NULL if not */
+#define buf_page_try_get(space_id, page_no, mtr) \
+ buf_page_try_get_func(space_id, page_no, __FILE__, __LINE__, mtr);
+
+/********************************************************************//**
+Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@return pointer to the block, or NULL if not compressed */
+UNIV_INTERN
+buf_page_t*
+buf_page_get_zip(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+This is the general function used to get access to a database page.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_page_get_gen(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint offset, /*!< in: page number */
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+ buf_block_t* guess, /*!< in: guessed block or NULL */
+ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
+ BUF_GET_NO_LATCH */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+/********************************************************************//**
+Initializes a page to the buffer buf_pool. The page is usually not read
+from a file even if it cannot be found in the buffer buf_pool. This is one
+of the functions which perform to a block a state transition NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
+@return pointer to the block, page bufferfixed */
+UNIV_INTERN
+buf_block_t*
+buf_page_create(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space in units of
+ a page */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Inits a page to the buffer buf_pool, for use in ibbackup --restore. */
+UNIV_INTERN
+void
+buf_page_init_for_backup_restore(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space
+ in units of a page */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ buf_block_t* block); /*!< in: block to init */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+ buf_page_t* bpage); /*!< in: buffer block */
+/********************************************************************//**
+Decrements the bufferfix count of a buffer control block and releases
+a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH,
+ RW_NO_LATCH */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+Moves a page to the start of the buffer pool LRU list. This high-level
+function can be used to prevent an important page from from slipping out of
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_make_young(
+/*================*/
+ buf_page_t* bpage); /*!< in: buffer block of a file page */
+/********************************************************************//**
+Returns TRUE if the page can be found in the buffer pool hash table.
+
+NOTE that it is possible that the page is not yet read from disk,
+though.
+
+@return TRUE if found in the page hash table */
+UNIV_INLINE
+ibool
+buf_page_peek(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Resets the check_index_page_at_flush field of a page if found in the buffer
+pool. */
+UNIV_INTERN
+void
+buf_reset_check_index_page_at_flush(
+/*================================*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+/********************************************************************//**
+Sets file_page_was_freed TRUE if the page is found in the buffer pool.
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_set_file_page_was_freed(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Sets file_page_was_freed FALSE if the page is found in the buffer pool.
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_reset_file_page_was_freed(
+/*===============================*/
+ ulint space, /*!< in: space id */
+ ulint offset); /*!< in: page number */
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_page_get_freed_page_clock(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: block */
+ __attribute__((pure));
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_block_get_freed_page_clock(
+/*===========================*/
+ const buf_block_t* block) /*!< in: block */
+ __attribute__((pure));
+
+/********************************************************************//**
+Recommends a move of a block to the start of the LRU list if there is danger
+of dropping from the buffer pool. NOTE: does not reserve the buffer pool
+mutex.
+@return TRUE if should be made younger */
+UNIV_INLINE
+ibool
+buf_page_peek_if_too_old(
+/*=====================*/
+ const buf_page_t* bpage); /*!< in: block to make younger */
+/********************************************************************//**
+Returns the current state of is_hashed of a page. FALSE if the page is
+not in the pool. NOTE that this operation does not fix the page in the
+pool if it is found there.
+@return TRUE if page hash index is built in search system */
+UNIV_INTERN
+ibool
+buf_page_peek_if_search_hashed(
+/*===========================*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Gets the youngest modification log sequence number for a frame.
+Returns zero if not file page or no modification occurred yet.
+@return newest modification to page */
+UNIV_INLINE
+ib_uint64_t
+buf_page_get_newest_modification(
+/*=============================*/
+ const buf_page_t* bpage); /*!< in: block containing the
+ page frame */
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+ buf_block_t* block); /*!< in: block */
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+ buf_block_t* block); /*!< in: block */
+#else /* !UNIV_HOTBACKUP */
+# define buf_block_modify_clock_inc(block) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value
+on 32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+ const byte* page); /*!< in: buffer page */
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+ const byte* page); /*!< in: buffer page */
+/********************************************************************//**
+Checks if a page is corrupt.
+@return TRUE if corrupted */
+UNIV_INTERN
+ibool
+buf_page_is_corrupted(
+/*==================*/
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size); /*!< in: size of compressed page;
+ 0 for uncompressed pages */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+ const void* ptr, /*!< in: pointer to a buffer frame */
+ ulint* space, /*!< out: space id */
+ fil_addr_t* addr); /*!< out: page offset and byte offset */
+/**********************************************************************//**
+Gets the hash value of a block. This can be used in searches in the
+lock hash table.
+@return lock hash value */
+UNIV_INLINE
+ulint
+buf_block_get_lock_hash_val(
+/*========================*/
+ const buf_block_t* block) /*!< in: block */
+ __attribute__((pure));
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Finds a block in the buffer pool that points to a
+given compressed page.
+@return buffer block pointing to the compressed page, or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_pool_contains_zip(
+/*==================*/
+ const void* data); /*!< in: pointer to compressed page */
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Validates the buffer pool data structure.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_validate(void);
+/*==============*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Prints info of the buffer pool data structure. */
+UNIV_INTERN
+void
+buf_print(void);
+/*============*/
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Prints a page to stderr. */
+UNIV_INTERN
+void
+buf_page_print(
+/*===========*/
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size); /*!< in: compressed page size, or
+ 0 for uncompressed pages */
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check); /*!< in: TRUE=verify the page checksum */
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the number of latched pages in the buffer pool.
+@return number of latched pages */
+UNIV_INTERN
+ulint
+buf_get_latched_pages_number(void);
+/*==============================*/
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Returns the number of pending buf pool ios.
+@return number of pending I/O operations */
+UNIV_INTERN
+ulint
+buf_get_n_pending_ios(void);
+/*=======================*/
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+UNIV_INTERN
+void
+buf_print_io(
+/*=========*/
+ FILE* file); /*!< in: file where to print */
+/*********************************************************************//**
+Returns the ratio in percents of modified pages in the buffer pool /
+database pages in the buffer pool.
+@return modified page percentage ratio */
+UNIV_INTERN
+ulint
+buf_get_modified_ratio_pct(void);
+/*============================*/
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats(void);
+/*======================*/
+/*********************************************************************//**
+Asserts that all file pages in the buffer are in a replaceable state.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_all_freed(void);
+/*===============*/
+/*********************************************************************//**
+Checks that there currently are no pending i/o-operations for the buffer
+pool.
+@return TRUE if there is no pending i/o */
+UNIV_INTERN
+ibool
+buf_pool_check_no_pending_io(void);
+/*==============================*/
+/*********************************************************************//**
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. */
+UNIV_INTERN
+void
+buf_pool_invalidate(void);
+/*=====================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/*========================================================================
+--------------------------- LOWER LEVEL ROUTINES -------------------------
+=========================================================================*/
+
+#ifdef UNIV_SYNC_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+ buf_block_t* block, /*!< in: buffer page
+ where we have acquired latch */
+ ulint level); /*!< in: latching order level */
+#else /* UNIV_SYNC_DEBUG */
+# define buf_block_dbg_add_level(block, level) /* nothing */
+#endif /* UNIV_SYNC_DEBUG */
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_page_get_state(
+/*===============*/
+ const buf_page_t* bpage); /*!< in: pointer to the control block */
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_block_get_state(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_page_set_state(
+/*===============*/
+ buf_page_t* bpage, /*!< in/out: pointer to control block */
+ enum buf_page_state state); /*!< in: state */
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_block_set_state(
+/*================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ enum buf_page_state state); /*!< in: state */
+/*********************************************************************//**
+Determines if a block is mapped to a tablespace.
+@return TRUE if mapped */
+UNIV_INLINE
+ibool
+buf_page_in_file(
+/*=============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+ __attribute__((pure));
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Determines if a block should be on unzip_LRU list.
+@return TRUE if block belongs to unzip_LRU */
+UNIV_INLINE
+ibool
+buf_page_belongs_to_unzip_LRU(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Determine the approximate LRU list position of a block.
+@return LRU list position */
+UNIV_INLINE
+ulint
+buf_page_get_LRU_position(
+/*======================*/
+ const buf_page_t* bpage) /*!< in: control block */
+ __attribute__((pure));
+
+/*********************************************************************//**
+Gets the mutex of a block.
+@return pointer to mutex protecting bpage */
+UNIV_INLINE
+mutex_t*
+buf_page_get_mutex(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+ __attribute__((pure));
+
+/*********************************************************************//**
+Get the flush type of a page.
+@return flush type */
+UNIV_INLINE
+enum buf_flush
+buf_page_get_flush_type(
+/*====================*/
+ const buf_page_t* bpage) /*!< in: buffer page */
+ __attribute__((pure));
+/*********************************************************************//**
+Set the flush type of a page. */
+UNIV_INLINE
+void
+buf_page_set_flush_type(
+/*====================*/
+ buf_page_t* bpage, /*!< in: buffer page */
+ enum buf_flush flush_type); /*!< in: flush type */
+/*********************************************************************//**
+Map a block to a file page. */
+UNIV_INLINE
+void
+buf_block_set_file_page(
+/*====================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ ulint space, /*!< in: tablespace id */
+ ulint page_no);/*!< in: page number */
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_page_get_io_fix(
+/*================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_block_get_io_fix(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_page_set_io_fix(
+/*================*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ enum buf_io_fix io_fix);/*!< in: io_fix state */
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_block_set_io_fix(
+/*=================*/
+ buf_block_t* block, /*!< in/out: control block */
+ enum buf_io_fix io_fix);/*!< in: io_fix state */
+
+/********************************************************************//**
+Determine if a buffer block can be relocated in memory. The block
+can be dirty, but it must not be I/O-fixed or bufferfixed. */
+UNIV_INLINE
+ibool
+buf_page_can_relocate(
+/*==================*/
+ const buf_page_t* bpage) /*!< control block being relocated */
+ __attribute__((pure));
+
+/*********************************************************************//**
+Determine if a block has been flagged old.
+@return TRUE if old */
+UNIV_INLINE
+ibool
+buf_page_is_old(
+/*============*/
+ const buf_page_t* bpage) /*!< in: control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Flag a block old. */
+UNIV_INLINE
+void
+buf_page_set_old(
+/*=============*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ ibool old); /*!< in: old */
+/*********************************************************************//**
+Determine if a block has been accessed in the buffer pool.
+@return TRUE if accessed */
+UNIV_INLINE
+ibool
+buf_page_is_accessed(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Flag a block accessed. */
+UNIV_INLINE
+void
+buf_page_set_accessed(
+/*==================*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ ibool accessed); /*!< in: accessed */
+/*********************************************************************//**
+Gets the buf_block_t handle of a buffered file block if an uncompressed
+page frame exists, or NULL.
+@return control block, or NULL */
+UNIV_INLINE
+buf_block_t*
+buf_page_get_block(
+/*===============*/
+ buf_page_t* bpage) /*!< in: control block, or NULL */
+ __attribute__((pure));
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block.
+@return pointer to the frame */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+#else /* UNIV_DEBUG */
+# define buf_block_get_frame(block) (block)->frame
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_page_get_space(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_block_get_space(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_page_get_page_no(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_page_get_zip_size(
+/*==================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_block_get_zip_size(
+/*===================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable. */
+#define buf_block_get_page_zip(block) \
+ (UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Gets the block to whose frame the pointer is pointing to.
+@return pointer to block, never NULL */
+UNIV_INTERN
+buf_block_t*
+buf_block_align(
+/*============*/
+ const byte* ptr); /*!< in: pointer to a frame */
+/********************************************************************//**
+Find out if a pointer belongs to a buf_block_t. It can be a pointer to
+the buf_block_t itself or a member of it
+@return TRUE if ptr belongs to a buf_block_t struct */
+UNIV_INTERN
+ibool
+buf_pointer_is_block_field(
+/*=======================*/
+ const void* ptr); /*!< in: pointer not
+ dereferenced */
+/** Find out if a pointer corresponds to a buf_block_t::mutex.
+@param m in: mutex candidate
+@return TRUE if m is a buf_block_t::mutex */
+#define buf_pool_is_block_mutex(m) \
+ buf_pointer_is_block_field((const void*)(m))
+/** Find out if a pointer corresponds to a buf_block_t::lock.
+@param l in: rw-lock candidate
+@return TRUE if l is a buf_block_t::lock */
+#define buf_pool_is_block_lock(l) \
+ buf_pointer_is_block_field((const void*)(l))
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+const page_zip_des_t*
+buf_frame_get_page_zip(
+/*===================*/
+ const byte* ptr); /*!< in: pointer to the page */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/********************************************************************//**
+Function which inits a page for read to the buffer buf_pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_init_for_read(
+/*===================*/
+ ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ ibool unzip, /*!< in: TRUE=request uncompressed page */
+ ib_int64_t tablespace_version,/*!< in: prevents reading from a wrong
+ version of the tablespace in case we have done
+ DISCARD + IMPORT */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_io_complete(
+/*=================*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+/********************************************************************//**
+Calculates a folded value of a file page address to use in the page hash
+table.
+@return the folded value */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+ __attribute__((const));
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get(
+/*==============*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: offset of the page within space */
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found
+or an uncompressed page frame does not exist.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_block_t*
+buf_block_hash_get(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: offset of the page within space */
+/*******************************************************************//**
+Increments the pool clock by one and returns its new value. Remember that
+in the 32 bit version the clock wraps around at 4 billion!
+@return new clock value */
+UNIV_INLINE
+ulint
+buf_pool_clock_tic(void);
+/*====================*/
+/*********************************************************************//**
+Gets the current length of the free list of buffer blocks.
+@return length of the free list */
+UNIV_INTERN
+ulint
+buf_get_free_list_len(void);
+/*=======================*/
+#endif /* !UNIV_HOTBACKUP */
+
+
+/** The common buffer control block structure
+for compressed and uncompressed frames */
+
+struct buf_page_struct{
+ /** @name General fields
+ None of these bit-fields must be modified without holding
+ buf_page_get_mutex() [buf_block_struct::mutex or
+ buf_pool_zip_mutex], since they can be stored in the same
+ machine word. Some of these fields are additionally protected
+ by buf_pool_mutex. */
+ /* @{ */
+
+ unsigned space:32; /*!< tablespace id; also protected
+ by buf_pool_mutex. */
+ unsigned offset:32; /*!< page number; also protected
+ by buf_pool_mutex. */
+
+ unsigned state:3; /*!< state of the control block; also
+ protected by buf_pool_mutex.
+ State transitions from
+ BUF_BLOCK_READY_FOR_USE to
+ BUF_BLOCK_MEMORY need not be
+ protected by buf_page_get_mutex().
+ @see enum buf_page_state */
+#ifndef UNIV_HOTBACKUP
+ unsigned flush_type:2; /*!< if this block is currently being
+ flushed to disk, this tells the
+ flush_type.
+ @see enum buf_flush */
+ unsigned accessed:1; /*!< TRUE if the page has been accessed
+ while in the buffer pool: read-ahead
+ may read in pages which have not been
+ accessed yet; a thread is allowed to
+ read this for heuristic purposes
+ without holding any mutex or latch */
+ unsigned io_fix:2; /*!< type of pending I/O operation;
+ also protected by buf_pool_mutex
+ @see enum buf_io_fix */
+ unsigned buf_fix_count:24;/*!< count of how manyfold this block
+ is currently bufferfixed */
+ /* @} */
+#endif /* !UNIV_HOTBACKUP */
+ page_zip_des_t zip; /*!< compressed page; zip.data
+ (but not the data it points to) is
+ also protected by buf_pool_mutex */
+#ifndef UNIV_HOTBACKUP
+ buf_page_t* hash; /*!< node used in chaining to
+ buf_pool->page_hash or
+ buf_pool->zip_hash */
+#ifdef UNIV_DEBUG
+ ibool in_page_hash; /*!< TRUE if in buf_pool->page_hash */
+ ibool in_zip_hash; /*!< TRUE if in buf_pool->zip_hash */
+#endif /* UNIV_DEBUG */
+
+ /** @name Page flushing fields
+ All these are protected by buf_pool_mutex. */
+ /* @{ */
+
+ UT_LIST_NODE_T(buf_page_t) list;
+ /*!< based on state, this is a
+ list node, protected only by
+ buf_pool_mutex, in one of the
+ following lists in buf_pool:
+
+ - BUF_BLOCK_NOT_USED: free
+ - BUF_BLOCK_FILE_PAGE: flush_list
+ - BUF_BLOCK_ZIP_DIRTY: flush_list
+ - BUF_BLOCK_ZIP_PAGE: zip_clean
+ - BUF_BLOCK_ZIP_FREE: zip_free[] */
+#ifdef UNIV_DEBUG
+ ibool in_flush_list; /*!< TRUE if in buf_pool->flush_list;
+ when buf_pool_mutex is free, the
+ following should hold: in_flush_list
+ == (state == BUF_BLOCK_FILE_PAGE
+ || state == BUF_BLOCK_ZIP_DIRTY) */
+ ibool in_free_list; /*!< TRUE if in buf_pool->free; when
+ buf_pool_mutex is free, the following
+ should hold: in_free_list
+ == (state == BUF_BLOCK_NOT_USED) */
+#endif /* UNIV_DEBUG */
+ ib_uint64_t newest_modification;
+ /*!< log sequence number of
+ the youngest modification to
+ this block, zero if not
+ modified */
+ ib_uint64_t oldest_modification;
+ /*!< log sequence number of
+ the START of the log entry
+ written of the oldest
+ modification to this block
+ which has not yet been flushed
+ on disk; zero if all
+ modifications are on disk */
+ /* @} */
+ /** @name LRU replacement algorithm fields
+ These fields are protected by buf_pool_mutex only (not
+ buf_pool_zip_mutex or buf_block_struct::mutex). */
+ /* @{ */
+
+ UT_LIST_NODE_T(buf_page_t) LRU;
+ /*!< node of the LRU list */
+#ifdef UNIV_DEBUG
+ ibool in_LRU_list; /*!< TRUE if the page is in
+ the LRU list; used in
+ debugging */
+#endif /* UNIV_DEBUG */
+ unsigned old:1; /*!< TRUE if the block is in the old
+ blocks in the LRU list */
+ unsigned LRU_position:31;/*!< value which monotonically
+ decreases (or may stay
+ constant if old==TRUE) toward
+ the end of the LRU list, if
+ buf_pool->ulint_clock has not
+ wrapped around: NOTE that this
+ value can only be used in
+ heuristic algorithms, because
+ of the possibility of a
+ wrap-around! */
+ unsigned freed_page_clock:32;/*!< the value of
+ buf_pool->freed_page_clock
+ when this block was the last
+ time put to the head of the
+ LRU list; a thread is allowed
+ to read this for heuristic
+ purposes without holding any
+ mutex or latch */
+ /* @} */
+# ifdef UNIV_DEBUG_FILE_ACCESSES
+ ibool file_page_was_freed;
+ /*!< this is set to TRUE when fsp
+ frees a page in buffer pool */
+# endif /* UNIV_DEBUG_FILE_ACCESSES */
+#endif /* !UNIV_HOTBACKUP */
+};
+
+/** The buffer control block structure */
+
+struct buf_block_struct{
+
+ /** @name General fields */
+ /* @{ */
+
+ buf_page_t page; /*!< page information; this must
+ be the first field, so that
+ buf_pool->page_hash can point
+ to buf_page_t or buf_block_t */
+ byte* frame; /*!< pointer to buffer frame which
+ is of size UNIV_PAGE_SIZE, and
+ aligned to an address divisible by
+ UNIV_PAGE_SIZE */
+#ifndef UNIV_HOTBACKUP
+ UT_LIST_NODE_T(buf_block_t) unzip_LRU;
+ /*!< node of the decompressed LRU list;
+ a block is in the unzip_LRU list
+ if page.state == BUF_BLOCK_FILE_PAGE
+ and page.zip.data != NULL */
+#ifdef UNIV_DEBUG
+ ibool in_unzip_LRU_list;/*!< TRUE if the page is in the
+ decompressed LRU list;
+ used in debugging */
+#endif /* UNIV_DEBUG */
+ mutex_t mutex; /*!< mutex protecting this block:
+ state (also protected by the buffer
+ pool mutex), io_fix, buf_fix_count,
+ and accessed; we introduce this new
+ mutex in InnoDB-5.1 to relieve
+ contention on the buffer pool mutex */
+ rw_lock_t lock; /*!< read-write lock of the buffer
+ frame */
+ unsigned lock_hash_val:32;/*!< hashed value of the page address
+ in the record lock hash table */
+ unsigned check_index_page_at_flush:1;
+ /*!< TRUE if we know that this is
+ an index page, and want the database
+ to check its consistency before flush;
+ note that there may be pages in the
+ buffer pool which are index pages,
+ but this flag is not set because
+ we do not keep track of all pages */
+ /* @} */
+ /** @name Optimistic search field */
+ /* @{ */
+
+ ib_uint64_t modify_clock; /*!< this clock is incremented every
+ time a pointer to a record on the
+ page may become obsolete; this is
+ used in the optimistic cursor
+ positioning: if the modify clock has
+ not changed, we know that the pointer
+ is still valid; this field may be
+ changed if the thread (1) owns the
+ pool mutex and the page is not
+ bufferfixed, or (2) the thread has an
+ x-latch on the block */
+ /* @} */
+ /** @name Hash search fields (unprotected)
+ NOTE that these fields are NOT protected by any semaphore! */
+ /* @{ */
+
+ ulint n_hash_helps; /*!< counter which controls building
+ of a new hash index for the page */
+ ulint n_fields; /*!< recommended prefix length for hash
+ search: number of full fields */
+ ulint n_bytes; /*!< recommended prefix: number of bytes
+ in an incomplete field */
+ ibool left_side; /*!< TRUE or FALSE, depending on
+ whether the leftmost record of several
+ records with the same prefix should be
+ indexed in the hash index */
+ /* @} */
+
+ /** @name Hash search fields
+ These 6 fields may only be modified when we have
+ an x-latch on btr_search_latch AND
+ - we are holding an s-latch or x-latch on buf_block_struct::lock or
+ - we know that buf_block_struct::buf_fix_count == 0.
+
+ An exception to this is when we init or create a page
+ in the buffer pool in buf0buf.c. */
+
+ /* @{ */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ulint n_pointers; /*!< used in debugging: the number of
+ pointers in the adaptive hash index
+ pointing to this frame */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ unsigned is_hashed:1; /*!< TRUE if hash index has
+ already been built on this
+ page; note that it does not
+ guarantee that the index is
+ complete, though: there may
+ have been hash collisions,
+ record deletions, etc. */
+ unsigned curr_n_fields:10;/*!< prefix length for hash indexing:
+ number of full fields */
+ unsigned curr_n_bytes:15;/*!< number of bytes in hash
+ indexing */
+ unsigned curr_left_side:1;/*!< TRUE or FALSE in hash indexing */
+ dict_index_t* index; /*!< Index for which the adaptive
+ hash index has been created. */
+ /* @} */
+# ifdef UNIV_SYNC_DEBUG
+ /** @name Debug fields */
+ /* @{ */
+ rw_lock_t debug_latch; /*!< in the debug version, each thread
+ which bufferfixes the block acquires
+ an s-latch here; so we can use the
+ debug utilities in sync0rw */
+ /* @} */
+# endif
+#endif /* !UNIV_HOTBACKUP */
+};
+
+/** Check if a buf_block_t object is in a valid state
+@param block buffer block
+@return TRUE if valid */
+#define buf_block_state_valid(block) \
+(buf_block_get_state(block) >= BUF_BLOCK_NOT_USED \
+ && (buf_block_get_state(block) <= BUF_BLOCK_REMOVE_HASH))
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Compute the hash fold value for blocks in buf_pool->zip_hash. */
+/* @{ */
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE)
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
+#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
+/* @} */
+
+/** @brief The buffer pool structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+struct buf_pool_struct{
+
+ /** @name General fields */
+ /* @{ */
+
+ ulint n_chunks; /*!< number of buffer pool chunks */
+ buf_chunk_t* chunks; /*!< buffer pool chunks */
+ ulint curr_size; /*!< current pool size in pages */
+ hash_table_t* page_hash; /*!< hash table of buf_page_t or
+ buf_block_t file pages,
+ buf_page_in_file() == TRUE,
+ indexed by (space_id, offset) */
+ hash_table_t* zip_hash; /*!< hash table of buf_block_t blocks
+ whose frames are allocated to the
+ zip buddy system,
+ indexed by block->frame */
+ ulint n_pend_reads; /*!< number of pending read operations */
+ ulint n_pend_unzip; /*!< number of pending decompressions */
+
+ time_t last_printout_time; /*!< when buf_print was last time
+ called */
+ ulint n_pages_read; /*!< number read operations */
+ ulint n_pages_written;/*!< number write operations */
+ ulint n_pages_created;/*!< number of pages created
+ in the pool with no read */
+ ulint n_page_gets; /*!< number of page gets performed;
+ also successful searches through
+ the adaptive hash index are
+ counted as page gets; this field
+ is NOT protected by the buffer
+ pool mutex */
+ ulint n_page_gets_old;/*!< n_page_gets when buf_print was
+ last time called: used to calculate
+ hit rate */
+ ulint n_pages_read_old;/*!< n_pages_read when buf_print was
+ last time called */
+ ulint n_pages_written_old;/*!< number write operations */
+ ulint n_pages_created_old;/*!< number of pages created in
+ the pool with no read */
+ /* @} */
+ /** @name Page flushing algorithm fields */
+ /* @{ */
+
+ UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+ /*!< base node of the modified block
+ list */
+ ibool init_flush[BUF_FLUSH_N_TYPES];
+ /*!< this is TRUE when a flush of the
+ given type is being initialized */
+ ulint n_flush[BUF_FLUSH_N_TYPES];
+ /*!< this is the number of pending
+ writes in the given flush type */
+ os_event_t no_flush[BUF_FLUSH_N_TYPES];
+ /*!< this is in the set state
+ when there is no flush batch
+ of the given type running */
+ ulint ulint_clock; /*!< a sequence number used to count
+ time. NOTE! This counter wraps
+ around at 4 billion (if ulint ==
+ 32 bits)! */
+ ulint freed_page_clock;/*!< a sequence number used
+ to count the number of buffer
+ blocks removed from the end of
+ the LRU list; NOTE that this
+ counter may wrap around at 4
+ billion! A thread is allowed
+ to read this for heuristic
+ purposes without holding any
+ mutex or latch */
+ ulint LRU_flush_ended;/*!< when an LRU flush ends for a page,
+ this is incremented by one; this is
+ set to zero when a buffer block is
+ allocated */
+
+ /* @} */
+ /** @name LRU replacement algorithm fields */
+ /* @{ */
+
+ UT_LIST_BASE_NODE_T(buf_page_t) free;
+ /*!< base node of the free
+ block list */
+ UT_LIST_BASE_NODE_T(buf_page_t) LRU;
+ /*!< base node of the LRU list */
+ buf_page_t* LRU_old; /*!< pointer to the about 3/8 oldest
+ blocks in the LRU list; NULL if LRU
+ length less than BUF_LRU_OLD_MIN_LEN;
+ NOTE: when LRU_old != NULL, its length
+ should always equal LRU_old_len */
+ ulint LRU_old_len; /*!< length of the LRU list from
+ the block to which LRU_old points
+ onward, including that block;
+ see buf0lru.c for the restrictions
+ on this value; not defined if
+ LRU_old == NULL;
+ NOTE: LRU_old_len must be adjusted
+ whenever LRU_old shrinks or grows! */
+
+ UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
+ /*!< base node of the
+ unzip_LRU list */
+
+ /* @} */
+ /** @name Buddy allocator fields
+ The buddy allocator is used for allocating compressed page
+ frames and buf_page_t descriptors of blocks that exist
+ in the buffer pool only in compressed form. */
+ /* @{ */
+ UT_LIST_BASE_NODE_T(buf_page_t) zip_clean;
+ /*!< unmodified compressed pages */
+ UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES];
+ /*!< buddy free lists */
+#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE
+# error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE"
+#endif
+#if BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE
+# error "BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE"
+#endif
+ /* @} */
+};
+
+/** mutex protecting the buffer pool struct and control blocks, except the
+read-write lock in them */
+extern mutex_t buf_pool_mutex;
+/** mutex protecting the control blocks of compressed-only pages
+(of type buf_page_t, not buf_block_t) */
+extern mutex_t buf_pool_zip_mutex;
+
+/** @name Accessors for buf_pool_mutex.
+Use these instead of accessing buf_pool_mutex directly. */
+/* @{ */
+
+/** Test if buf_pool_mutex is owned. */
+#define buf_pool_mutex_own() mutex_own(&buf_pool_mutex)
+/** Acquire the buffer pool mutex. */
+#define buf_pool_mutex_enter() do { \
+ ut_ad(!mutex_own(&buf_pool_zip_mutex)); \
+ mutex_enter(&buf_pool_mutex); \
+} while (0)
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/** Flag to forbid the release of the buffer pool mutex.
+Protected by buf_pool_mutex. */
+extern ulint buf_pool_mutex_exit_forbidden;
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() do { \
+ ut_ad(buf_pool_mutex_own()); \
+ buf_pool_mutex_exit_forbidden++; \
+} while (0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() do { \
+ ut_ad(buf_pool_mutex_own()); \
+ ut_a(buf_pool_mutex_exit_forbidden); \
+ buf_pool_mutex_exit_forbidden--; \
+} while (0)
+/** Release the buffer pool mutex. */
+# define buf_pool_mutex_exit() do { \
+ ut_a(!buf_pool_mutex_exit_forbidden); \
+ mutex_exit(&buf_pool_mutex); \
+} while (0)
+#else
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() ((void) 0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() ((void) 0)
+/** Release the buffer pool mutex. */
+# define buf_pool_mutex_exit() mutex_exit(&buf_pool_mutex)
+#endif
+#endif /* !UNIV_HOTBACKUP */
+/* @} */
+
+/**********************************************************************
+Let us list the consistency conditions for different control block states.
+
+NOT_USED: is in free list, not in LRU list, not in flush list, nor
+ page hash table
+READY_FOR_USE: is not in free list, LRU list, or flush list, nor page
+ hash table
+MEMORY: is not in free list, LRU list, or flush list, nor page
+ hash table
+FILE_PAGE: space and offset are defined, is in page hash table
+ if io_fix == BUF_IO_WRITE,
+ pool: no_flush[flush_type] is in reset state,
+ pool: n_flush[flush_type] > 0
+
+ (1) if buf_fix_count == 0, then
+ is in LRU list, not in free list
+ is in flush list,
+ if and only if oldest_modification > 0
+ is x-locked,
+ if and only if io_fix == BUF_IO_READ
+ is s-locked,
+ if and only if io_fix == BUF_IO_WRITE
+
+ (2) if buf_fix_count > 0, then
+ is not in LRU list, not in free list
+ is in flush list,
+ if and only if oldest_modification > 0
+ if io_fix == BUF_IO_READ,
+ is x-locked
+ if io_fix == BUF_IO_WRITE,
+ is s-locked
+
+State transitions:
+
+NOT_USED => READY_FOR_USE
+READY_FOR_USE => MEMORY
+READY_FOR_USE => FILE_PAGE
+MEMORY => NOT_USED
+FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if
+ (1) buf_fix_count == 0,
+ (2) oldest_modification == 0, and
+ (3) io_fix == 0.
+*/
+
+#ifndef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic
new file mode 100644
index 00000000000..17064342116
--- /dev/null
+++ b/storage/innobase/include/buf0buf.ic
@@ -0,0 +1,1066 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.ic
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0mtr.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_page_get_freed_page_clock(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: block */
+{
+ /* This is sometimes read without holding buf_pool_mutex. */
+ return(bpage->freed_page_clock);
+}
+
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_block_get_freed_page_clock(
+/*===========================*/
+ const buf_block_t* block) /*!< in: block */
+{
+ return(buf_page_get_freed_page_clock(&block->page));
+}
+
+/********************************************************************//**
+Recommends a move of a block to the start of the LRU list if there is danger
+of dropping from the buffer pool. NOTE: does not reserve the buffer pool
+mutex.
+@return TRUE if should be made younger */
+UNIV_INLINE
+ibool
+buf_page_peek_if_too_old(
+/*=====================*/
+ const buf_page_t* bpage) /*!< in: block to make younger */
+{
+ return(buf_pool->freed_page_clock
+ >= buf_page_get_freed_page_clock(bpage)
+ + 1 + (buf_pool->curr_size / 4));
+}
+
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void)
+/*========================*/
+{
+ return(buf_pool->curr_size * UNIV_PAGE_SIZE);
+}
+
+/********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return oldest modification in pool, zero if none */
+UNIV_INLINE
+ib_uint64_t
+buf_pool_get_oldest_modification(void)
+/*==================================*/
+{
+ buf_page_t* bpage;
+ ib_uint64_t lsn;
+
+ buf_pool_mutex_enter();
+
+ bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+
+ if (bpage == NULL) {
+ lsn = 0;
+ } else {
+ ut_ad(bpage->in_flush_list);
+ lsn = bpage->oldest_modification;
+ }
+
+ buf_pool_mutex_exit();
+
+ /* The returned answer may be out of date: the flush_list can
+ change after the mutex has been released. */
+
+ return(lsn);
+}
+
+/*******************************************************************//**
+Increments the buf_pool clock by one and returns its new value. Remember
+that in the 32 bit version the clock wraps around at 4 billion!
+@return new clock value */
+UNIV_INLINE
+ulint
+buf_pool_clock_tic(void)
+/*====================*/
+{
+ ut_ad(buf_pool_mutex_own());
+
+ buf_pool->ulint_clock++;
+
+ return(buf_pool->ulint_clock);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_page_get_state(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ enum buf_page_state state = (enum buf_page_state) bpage->state;
+
+#ifdef UNIV_DEBUG
+ switch (state) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_FILE_PAGE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ return(state);
+}
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_block_get_state(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ return(buf_page_get_state(&block->page));
+}
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_page_set_state(
+/*===============*/
+ buf_page_t* bpage, /*!< in/out: pointer to control block */
+ enum buf_page_state state) /*!< in: state */
+{
+#ifdef UNIV_DEBUG
+ enum buf_page_state old_state = buf_page_get_state(bpage);
+
+ switch (old_state) {
+ case BUF_BLOCK_ZIP_FREE:
+ ut_error;
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_a(state == BUF_BLOCK_ZIP_DIRTY);
+ break;
+ case BUF_BLOCK_ZIP_DIRTY:
+ ut_a(state == BUF_BLOCK_ZIP_PAGE);
+ break;
+ case BUF_BLOCK_NOT_USED:
+ ut_a(state == BUF_BLOCK_READY_FOR_USE);
+ break;
+ case BUF_BLOCK_READY_FOR_USE:
+ ut_a(state == BUF_BLOCK_MEMORY
+ || state == BUF_BLOCK_FILE_PAGE
+ || state == BUF_BLOCK_NOT_USED);
+ break;
+ case BUF_BLOCK_MEMORY:
+ ut_a(state == BUF_BLOCK_NOT_USED);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_a(state == BUF_BLOCK_NOT_USED
+ || state == BUF_BLOCK_REMOVE_HASH);
+ break;
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_a(state == BUF_BLOCK_MEMORY);
+ break;
+ }
+#endif /* UNIV_DEBUG */
+ bpage->state = state;
+ ut_ad(buf_page_get_state(bpage) == state);
+}
+
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_block_set_state(
+/*================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ enum buf_page_state state) /*!< in: state */
+{
+ buf_page_set_state(&block->page, state);
+}
+
+/*********************************************************************//**
+Determines if a block is mapped to a tablespace.
+@return TRUE if mapped */
+UNIV_INLINE
+ibool
+buf_page_in_file(
+/*=============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+{
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_FREE:
+ /* This is a free page in buf_pool->zip_free[].
+ Such pages should only be accessed by the buddy allocator. */
+ ut_error;
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_FILE_PAGE:
+ return(TRUE);
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ return(FALSE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Determines if a block should be on unzip_LRU list.
+@return TRUE if block belongs to unzip_LRU */
+UNIV_INLINE
+ibool
+buf_page_belongs_to_unzip_LRU(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+{
+ ut_ad(buf_page_in_file(bpage));
+
+ return(bpage->zip.data
+ && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+}
+
+/*********************************************************************//**
+Determine the approximate LRU list position of a block.
+@return LRU list position */
+UNIV_INLINE
+ulint
+buf_page_get_LRU_position(
+/*======================*/
+ const buf_page_t* bpage) /*!< in: control block */
+{
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(buf_pool_mutex_own());
+
+ return(bpage->LRU_position);
+}
+
+/*********************************************************************//**
+Gets the mutex of a block.
+@return pointer to mutex protecting bpage */
+UNIV_INLINE
+mutex_t*
+buf_page_get_mutex(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+{
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_FREE:
+ ut_error;
+ return(NULL);
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ return(&buf_pool_zip_mutex);
+ default:
+ return(&((buf_block_t*) bpage)->mutex);
+ }
+}
+
+/*********************************************************************//**
+Get the flush type of a page.
+@return flush type */
+UNIV_INLINE
+enum buf_flush
+buf_page_get_flush_type(
+/*====================*/
+ const buf_page_t* bpage) /*!< in: buffer page */
+{
+ enum buf_flush flush_type = (enum buf_flush) bpage->flush_type;
+
+#ifdef UNIV_DEBUG
+ switch (flush_type) {
+ case BUF_FLUSH_LRU:
+ case BUF_FLUSH_SINGLE_PAGE:
+ case BUF_FLUSH_LIST:
+ return(flush_type);
+ case BUF_FLUSH_N_TYPES:
+ break;
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(flush_type);
+}
+/*********************************************************************//**
+Set the flush type of a page. */
+UNIV_INLINE
+void
+buf_page_set_flush_type(
+/*====================*/
+ buf_page_t* bpage, /*!< in: buffer page */
+ enum buf_flush flush_type) /*!< in: flush type */
+{
+ bpage->flush_type = flush_type;
+ ut_ad(buf_page_get_flush_type(bpage) == flush_type);
+}
+
+/*********************************************************************//**
+Map a block to a file page. */
+UNIV_INLINE
+void
+buf_block_set_file_page(
+/*====================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ ulint space, /*!< in: tablespace id */
+ ulint page_no)/*!< in: page number */
+{
+ buf_block_set_state(block, BUF_BLOCK_FILE_PAGE);
+ block->page.space = space;
+ block->page.offset = page_no;
+}
+
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_page_get_io_fix(
+/*================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ enum buf_io_fix io_fix = (enum buf_io_fix) bpage->io_fix;
+#ifdef UNIV_DEBUG
+ switch (io_fix) {
+ case BUF_IO_NONE:
+ case BUF_IO_READ:
+ case BUF_IO_WRITE:
+ return(io_fix);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(io_fix);
+}
+
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_block_get_io_fix(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ return(buf_page_get_io_fix(&block->page));
+}
+
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_page_set_io_fix(
+/*================*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ enum buf_io_fix io_fix) /*!< in: io_fix state */
+{
+ ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ bpage->io_fix = io_fix;
+ ut_ad(buf_page_get_io_fix(bpage) == io_fix);
+}
+
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_block_set_io_fix(
+/*=================*/
+ buf_block_t* block, /*!< in/out: control block */
+ enum buf_io_fix io_fix) /*!< in: io_fix state */
+{
+ buf_page_set_io_fix(&block->page, io_fix);
+}
+
+/********************************************************************//**
+Determine if a buffer block can be relocated in memory. The block
+can be dirty, but it must not be I/O-fixed or bufferfixed. */
+UNIV_INLINE
+ibool
+buf_page_can_relocate(
+/*==================*/
+ const buf_page_t* bpage) /*!< control block being relocated */
+{
+ ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+
+ return(buf_page_get_io_fix(bpage) == BUF_IO_NONE
+ && bpage->buf_fix_count == 0);
+}
+
+/*********************************************************************//**
+Determine if a block has been flagged old.
+@return TRUE if old */
+UNIV_INLINE
+ibool
+buf_page_is_old(
+/*============*/
+ const buf_page_t* bpage) /*!< in: control block */
+{
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(buf_pool_mutex_own());
+
+ return(bpage->old);
+}
+
+/*********************************************************************//**
+Flag a block old. */
+UNIV_INLINE
+void
+buf_page_set_old(
+/*=============*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ ibool old) /*!< in: old */
+{
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(buf_pool_mutex_own());
+ ut_ad(bpage->in_LRU_list);
+
+#ifdef UNIV_LRU_DEBUG
+ if (UT_LIST_GET_PREV(LRU, bpage) && UT_LIST_GET_NEXT(LRU, bpage)
+ && UT_LIST_GET_PREV(LRU, bpage)->old
+ == UT_LIST_GET_NEXT(LRU, bpage)->old) {
+ ut_a(UT_LIST_GET_PREV(LRU, bpage)->old == old);
+ }
+#endif /* UNIV_LRU_DEBUG */
+
+ bpage->old = old;
+}
+
+/*********************************************************************//**
+Determine if a block has been accessed in the buffer pool.
+@return TRUE if accessed */
+UNIV_INLINE
+ibool
+buf_page_is_accessed(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: control block */
+{
+ ut_ad(buf_page_in_file(bpage));
+
+ return(bpage->accessed);
+}
+
+/*********************************************************************//**
+Flag a block accessed. */
+UNIV_INLINE
+void
+buf_page_set_accessed(
+/*==================*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ ibool accessed) /*!< in: accessed */
+{
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ bpage->accessed = accessed;
+}
+
+/*********************************************************************//**
+Gets the buf_block_t handle of a buffered file block if an uncompressed
+page frame exists, or NULL.
+@return control block, or NULL */
+UNIV_INLINE
+buf_block_t*
+buf_page_get_block(
+/*===============*/
+ buf_page_t* bpage) /*!< in: control block, or NULL */
+{
+ if (UNIV_LIKELY(bpage != NULL)) {
+ ut_ad(buf_page_in_file(bpage));
+
+ if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+ return((buf_block_t*) bpage);
+ }
+ }
+
+ return(NULL);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block.
+@return pointer to the frame */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ ut_ad(block);
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ ut_error;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+# ifndef UNIV_HOTBACKUP
+ ut_a(block->page.buf_fix_count > 0);
+# endif /* !UNIV_HOTBACKUP */
+ /* fall through */
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ goto ok;
+ }
+ ut_error;
+ok:
+ return((buf_frame_t*) block->frame);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_page_get_space(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ ut_ad(bpage);
+ ut_a(buf_page_in_file(bpage));
+
+ return(bpage->space);
+}
+
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_block_get_space(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ ut_ad(block);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ return(block->page.space);
+}
+
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_page_get_page_no(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ ut_ad(bpage);
+ ut_a(buf_page_in_file(bpage));
+
+ return(bpage->offset);
+}
+
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ ut_ad(block);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ return(block->page.offset);
+}
+
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_page_get_zip_size(
+/*==================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ return(bpage->zip.ssize ? 512 << bpage->zip.ssize : 0);
+}
+
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_block_get_zip_size(
+/*===================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ return(block->page.zip.ssize ? 512 << block->page.zip.ssize : 0);
+}
+
+#ifndef UNIV_HOTBACKUP
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+const page_zip_des_t*
+buf_frame_get_page_zip(
+/*===================*/
+ const byte* ptr) /*!< in: pointer to the page */
+{
+ return(buf_block_get_page_zip(buf_block_align(ptr)));
+}
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+ const void* ptr, /*!< in: pointer to a buffer frame */
+ ulint* space, /*!< out: space id */
+ fil_addr_t* addr) /*!< out: page offset and byte offset */
+{
+ const page_t* page = (const page_t*) ut_align_down(ptr,
+ UNIV_PAGE_SIZE);
+
+ *space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ addr->page = mach_read_from_4(page + FIL_PAGE_OFFSET);
+ addr->boffset = ut_align_offset(ptr, UNIV_PAGE_SIZE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets the hash value of the page the pointer is pointing to. This can be used
+in searches in the lock hash table.
+@return lock hash value */
+UNIV_INLINE
+ulint
+buf_block_get_lock_hash_val(
+/*========================*/
+ const buf_block_t* block) /*!< in: block */
+{
+ return(block->lock_hash_val);
+}
+
+/********************************************************************//**
+Allocates a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INLINE
+buf_block_t*
+buf_block_alloc(
+/*============*/
+ ulint zip_size) /*!< in: compressed page size in bytes,
+ or 0 if uncompressed tablespace */
+{
+ buf_block_t* block;
+
+ block = buf_LRU_get_free_block(zip_size);
+
+ buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+ return(block);
+}
+
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+ buf_block_t* block) /*!< in, own: block to be freed */
+{
+ buf_pool_mutex_enter();
+
+ mutex_enter(&block->mutex);
+
+ ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
+
+ buf_LRU_block_free_non_file_page(block);
+
+ mutex_exit(&block->mutex);
+
+ buf_pool_mutex_exit();
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Copies contents of a buffer frame to a given buffer.
+@return buf */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+ byte* buf, /*!< in: buffer to copy to */
+ const buf_frame_t* frame) /*!< in: buffer frame */
+{
+ ut_ad(buf && frame);
+
+ ut_memcpy(buf, frame, UNIV_PAGE_SIZE);
+
+ return(buf);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Calculates a folded value of a file page address to use in the page hash
+table.
+@return the folded value */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+{
+ return((space << 20) + space + offset);
+}
+
+/********************************************************************//**
+Gets the youngest modification log sequence number for a frame.
+Returns zero if not file page or no modification occurred yet.
+@return newest modification to page */
+UNIV_INLINE
+ib_uint64_t
+buf_page_get_newest_modification(
+/*=============================*/
+ const buf_page_t* bpage) /*!< in: block containing the
+ page frame */
+{
+ ib_uint64_t lsn;
+ mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ if (buf_page_in_file(bpage)) {
+ lsn = bpage->newest_modification;
+ } else {
+ lsn = 0;
+ }
+
+ mutex_exit(block_mutex);
+
+ return(lsn);
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+ buf_block_t* block) /*!< in: block */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad((buf_pool_mutex_own()
+ && (block->page.buf_fix_count == 0))
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+ block->modify_clock++;
+}
+
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+ buf_block_t* block) /*!< in: block */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+ return(block->modify_clock);
+}
+
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc_func(
+/*=======================*/
+#ifdef UNIV_SYNC_DEBUG
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line */
+#endif /* UNIV_SYNC_DEBUG */
+ buf_block_t* block) /*!< in/out: block to bufferfix */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ibool ret;
+
+ ret = rw_lock_s_lock_nowait(&(block->debug_latch), file, line);
+ ut_a(ret);
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&block->mutex));
+
+ block->page.buf_fix_count++;
+}
+#ifdef UNIV_SYNC_DEBUG
+/** Increments the bufferfix count.
+@param b in/out: block to bufferfix
+@param f in: file name where requested
+@param l in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
+#else /* UNIV_SYNC_DEBUG */
+/** Increments the bufferfix count.
+@param b in/out: block to bufferfix
+@param f in: file name where requested
+@param l in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*******************************************************************//**
+Decrements the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_dec(
+/*==================*/
+ buf_block_t* block) /*!< in/out: block to bufferunfix */
+{
+ ut_ad(mutex_own(&block->mutex));
+
+ block->page.buf_fix_count--;
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_s_unlock(&block->debug_latch);
+#endif
+}
+
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get(
+/*==============*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+{
+ buf_page_t* bpage;
+ ulint fold;
+
+ ut_ad(buf_pool);
+ ut_ad(buf_pool_mutex_own());
+
+ /* Look for the page in the hash table */
+
+ fold = buf_page_address_fold(space, offset);
+
+ HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage,
+ ut_ad(bpage->in_page_hash && !bpage->in_zip_hash
+ && buf_page_in_file(bpage)),
+ bpage->space == space && bpage->offset == offset);
+ if (bpage) {
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(bpage->in_page_hash);
+ ut_ad(!bpage->in_zip_hash);
+ UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+ }
+
+ return(bpage);
+}
+
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found
+or an uncompressed page frame does not exist.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_block_t*
+buf_block_hash_get(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+{
+ return(buf_page_get_block(buf_page_hash_get(space, offset)));
+}
+
+/********************************************************************//**
+Returns TRUE if the page can be found in the buffer pool hash table.
+
+NOTE that it is possible that the page is not yet read from disk,
+though.
+
+@return TRUE if found in the page hash table */
+UNIV_INLINE
+ibool
+buf_page_peek(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ const buf_page_t* bpage;
+
+ buf_pool_mutex_enter();
+
+ bpage = buf_page_hash_get(space, offset);
+
+ buf_pool_mutex_exit();
+
+ return(bpage != NULL);
+}
+
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+ buf_page_t* bpage) /*!< in: buffer block */
+{
+ buf_block_t* block;
+
+ ut_ad(bpage);
+ ut_a(bpage->buf_fix_count > 0);
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ mutex_enter(&buf_pool_zip_mutex);
+ bpage->buf_fix_count--;
+ mutex_exit(&buf_pool_zip_mutex);
+ return;
+ case BUF_BLOCK_FILE_PAGE:
+ block = (buf_block_t*) bpage;
+ mutex_enter(&block->mutex);
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_s_unlock(&block->debug_latch);
+#endif
+ bpage->buf_fix_count--;
+ mutex_exit(&block->mutex);
+ return;
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ ut_error;
+}
+
+/********************************************************************//**
+Decrements the bufferfix count of a buffer control block and releases
+a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH,
+ RW_NO_LATCH */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(block);
+
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_a(block->page.buf_fix_count > 0);
+
+ if (rw_latch == RW_X_LATCH && mtr->modifications) {
+ buf_pool_mutex_enter();
+ buf_flush_note_modification(block, mtr);
+ buf_pool_mutex_exit();
+ }
+
+ mutex_enter(&block->mutex);
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_s_unlock(&(block->debug_latch));
+#endif
+ block->page.buf_fix_count--;
+
+ mutex_exit(&block->mutex);
+
+ if (rw_latch == RW_S_LATCH) {
+ rw_lock_s_unlock(&(block->lock));
+ } else if (rw_latch == RW_X_LATCH) {
+ rw_lock_x_unlock(&(block->lock));
+ }
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+ buf_block_t* block, /*!< in: buffer page
+ where we have acquired latch */
+ ulint level) /*!< in: latching order level */
+{
+ sync_thread_add_level(&block->lock, level);
+}
+#endif /* UNIV_SYNC_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
new file mode 100644
index 00000000000..6c751852f54
--- /dev/null
+++ b/storage/innobase/include/buf0flu.h
@@ -0,0 +1,191 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.h
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0flu_h
+#define buf0flu_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0types.h"
+#include "buf0types.h"
+
+/********************************************************************//**
+Remove a block from the flush list of modified blocks. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+/********************************************************************//**
+Updates the flush system data structures when a write is completed. */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+/*********************************************************************//**
+Flushes pages from the end of the LRU list if there is too small
+a margin of replaceable pages there. */
+UNIV_INTERN
+void
+buf_flush_free_margin(void);
+/*=======================*/
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Initializes a page for writing to the tablespace. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+ byte* page, /*!< in/out: page */
+ void* page_zip_, /*!< in/out: compressed page, or NULL */
+ ib_uint64_t newest_lsn); /*!< in: newest modification lsn
+ to the page */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already running */
+UNIV_INTERN
+ulint
+buf_flush_batch(
+/*============*/
+ enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+ then the caller must not own any
+ latches on pages */
+ ulint min_n, /*!< in: wished minimum mumber of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+ ib_uint64_t lsn_limit); /*!< in the case BUF_FLUSH_LIST all
+ blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+/******************************************************************//**
+Waits until a flush batch of the given type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+ enum buf_flush type); /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block to the list of modified blocks, if it not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ ib_uint64_t start_lsn, /*!< in: start lsn of the first mtr in a
+ set of mtr's */
+ ib_uint64_t end_lsn); /*!< in: end lsn of the last mtr in the
+ set of mtr's */
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., transition FILE_PAGE => NOT_USED allowed.
+@return TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+ buf_page_t* bpage); /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) and in the LRU list */
+
+/** @brief Statistics for selecting flush rate based on redo log
+generation speed.
+
+These statistics are generated for heuristics used in estimating the
+rate at which we should flush the dirty blocks to avoid bursty IO
+activity. Note that the rate of flushing not only depends on how many
+dirty pages we have in the buffer pool but it is also a fucntion of
+how much redo the workload is generating and at what rate. */
+
+struct buf_flush_stat_struct
+{
+ ib_uint64_t redo; /**< amount of redo generated. */
+ ulint n_flushed; /**< number of pages flushed. */
+};
+
+/** Statistics for selecting flush rate of dirty pages. */
+typedef struct buf_flush_stat_struct buf_flush_stat_t;
+/*********************************************************************
+Update the historical stats that we are collecting for flush rate
+heuristics at the end of each interval. */
+UNIV_INTERN
+void
+buf_flush_stat_update(void);
+/*=======================*/
+/*********************************************************************
+Determines the fraction of dirty pages that need to be flushed based
+on the speed at which we generate redo log. Note that if redo log
+is generated at significant rate without a corresponding increase
+in the number of dirty pages (for example, an in-memory workload)
+it can cause IO bursts of flushing. This function implements heuristics
+to avoid this burstiness.
+@return number of dirty pages to be flushed / second */
+UNIV_INTERN
+ulint
+buf_flush_get_desired_flush_rate(void);
+/*==================================*/
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(void);
+/*====================*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/** When buf_flush_free_margin is called, it tries to make this many blocks
+available to replacement in the free list and at the end of the LRU list (to
+make sure that a read-ahead batch can be read efficiently in a single
+sweep). */
+#define BUF_FLUSH_FREE_BLOCK_MARGIN (5 + BUF_READ_AHEAD_AREA)
+/** Extra margin to apply above BUF_FLUSH_FREE_BLOCK_MARGIN */
+#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4 + 100)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic
new file mode 100644
index 00000000000..c90cd59e4b6
--- /dev/null
+++ b/storage/innobase/include/buf0flu.ic
@@ -0,0 +1,123 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.ic
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "buf0buf.h"
+#include "mtr0mtr.h"
+
+/********************************************************************//**
+Inserts a modified block into the flush list. */
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+ buf_block_t* block); /*!< in/out: block which is modified */
+/********************************************************************//**
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+ buf_block_t* block); /*!< in/out: block which is modified */
+
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block to the list of modified blocks, if it is not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(block);
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.buf_fix_count > 0);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(buf_pool_mutex_own());
+
+ ut_ad(mtr->start_lsn != 0);
+ ut_ad(mtr->modifications);
+ ut_ad(block->page.newest_modification <= mtr->end_lsn);
+
+ block->page.newest_modification = mtr->end_lsn;
+
+ if (!block->page.oldest_modification) {
+
+ block->page.oldest_modification = mtr->start_lsn;
+ ut_ad(block->page.oldest_modification != 0);
+
+ buf_flush_insert_into_flush_list(block);
+ } else {
+ ut_ad(block->page.oldest_modification <= mtr->start_lsn);
+ }
+
+ ++srv_buf_pool_write_requests;
+}
+
+/********************************************************************//**
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ ib_uint64_t start_lsn, /*!< in: start lsn of the first mtr in a
+ set of mtr's */
+ ib_uint64_t end_lsn) /*!< in: end lsn of the last mtr in the
+ set of mtr's */
+{
+ ut_ad(block);
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.buf_fix_count > 0);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ buf_pool_mutex_enter();
+
+ ut_ad(block->page.newest_modification <= end_lsn);
+
+ block->page.newest_modification = end_lsn;
+
+ if (!block->page.oldest_modification) {
+
+ block->page.oldest_modification = start_lsn;
+
+ ut_ad(block->page.oldest_modification != 0);
+
+ buf_flush_insert_sorted_into_flush_list(block);
+ } else {
+ ut_ad(block->page.oldest_modification <= start_lsn);
+ }
+
+ buf_pool_mutex_exit();
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
new file mode 100644
index 00000000000..463aca0982c
--- /dev/null
+++ b/storage/innobase/include/buf0lru.h
@@ -0,0 +1,263 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.h
+The database buffer pool LRU replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0lru_h
+#define buf0lru_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "buf0types.h"
+
+/** The return type of buf_LRU_free_block() */
+enum buf_lru_free_block_status {
+ /** freed */
+ BUF_LRU_FREED = 0,
+ /** not freed because the caller asked to remove the
+ uncompressed frame but the control block cannot be
+ relocated */
+ BUF_LRU_CANNOT_RELOCATE,
+ /** not freed because of some other reason */
+ BUF_LRU_NOT_FREED
+};
+
+/******************************************************************//**
+Tries to remove LRU flushed blocks from the end of the LRU list and put them
+to the free list. This is beneficial for the efficiency of the insert buffer
+operation, as flushed pages from non-unique non-clustered indexes are here
+taken out of the buffer pool, and their inserts redirected to the insert
+buffer. Otherwise, the flushed blocks could get modified again before read
+operations need new buffer blocks, and the i/o work done in flushing would be
+wasted. */
+UNIV_INTERN
+void
+buf_LRU_try_free_flushed_blocks(void);
+/*==================================*/
+/******************************************************************//**
+Returns TRUE if less than 25 % of the buffer pool is available. This can be
+used in heuristics to prevent huge transactions eating up the whole buffer
+pool for their locks.
+@return TRUE if less than 25 % of buffer pool left */
+UNIV_INTERN
+ibool
+buf_LRU_buf_pool_running_out(void);
+/*==============================*/
+
+/*#######################################################################
+These are low-level functions
+#########################################################################*/
+
+/** Minimum LRU list length for which the LRU_old pointer is defined */
+#define BUF_LRU_OLD_MIN_LEN 80
+
+/** Maximum LRU list search length in buf_flush_LRU_recommendation() */
+#define BUF_LRU_FREE_SEARCH_LEN (5 + 2 * BUF_READ_AHEAD_AREA)
+
+/******************************************************************//**
+Invalidates all pages belonging to a given tablespace when we are deleting
+the data file(s) of that tablespace. A PROBLEM: if readahead is being started,
+what guarantees that it will not try to read in pages after this operation has
+completed? */
+UNIV_INTERN
+void
+buf_LRU_invalidate_tablespace(
+/*==========================*/
+ ulint id); /*!< in: space id */
+/******************************************************************//**
+Gets the minimum LRU_position field for the blocks in an initial segment
+(determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not
+guaranteed to be precise, because the ulint_clock may wrap around.
+@return the limit; zero if could not determine it */
+UNIV_INTERN
+ulint
+buf_LRU_get_recent_limit(void);
+/*==========================*/
+/********************************************************************//**
+Insert a compressed block into buf_pool->zip_clean in the LRU order. */
+UNIV_INTERN
+void
+buf_LRU_insert_zip_clean(
+/*=====================*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+
+/******************************************************************//**
+Try to free a block. If bpage is a descriptor of a compressed-only
+page, the descriptor object will be freed as well.
+
+NOTE: If this function returns BUF_LRU_FREED, it will not temporarily
+release buf_pool_mutex. Furthermore, the page frame will no longer be
+accessible via bpage.
+
+The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and
+release these two mutexes after the call. No other
+buf_page_get_mutex() may be held when calling this function.
+@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or
+BUF_LRU_NOT_FREED otherwise. */
+UNIV_INTERN
+enum buf_lru_free_block_status
+buf_LRU_free_block(
+/*===============*/
+ buf_page_t* bpage, /*!< in: block to be freed */
+ ibool zip, /*!< in: TRUE if should remove also the
+ compressed page of an uncompressed page */
+ ibool* buf_pool_mutex_released);
+ /*!< in: pointer to a variable that will
+ be assigned TRUE if buf_pool_mutex
+ was temporarily released, or NULL */
+/******************************************************************//**
+Try to free a replaceable block.
+@return TRUE if found and freed */
+UNIV_INTERN
+ibool
+buf_LRU_search_and_free_block(
+/*==========================*/
+ ulint n_iterations); /*!< in: how many times this has been called
+ repeatedly without result: a high value means
+ that we should search farther; if
+ n_iterations < 10, then we search
+ n_iterations / 10 * buf_pool->curr_size
+ pages from the end of the LRU list; if
+ n_iterations < 5, then we will also search
+ n_iterations / 5 of the unzip_LRU list. */
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, returns NULL.
+@return a free control block, or NULL if the buf_block->free list is empty */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_only(void);
+/*=======================*/
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, blocks are moved from the end of the
+LRU list to the free list.
+@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_block(
+/*===================*/
+ ulint zip_size); /*!< in: compressed page size in bytes,
+ or 0 if uncompressed tablespace */
+
+/******************************************************************//**
+Puts a block back to the free list. */
+UNIV_INTERN
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+ buf_block_t* block); /*!< in: block, must not contain a file page */
+/******************************************************************//**
+Adds a block to the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_add_block(
+/*==============*/
+ buf_page_t* bpage, /*!< in: control block */
+ ibool old); /*!< in: TRUE if should be put to the old
+ blocks in the LRU list, else put to the
+ start; if the LRU list is very short, added to
+ the start regardless of this parameter */
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+UNIV_INTERN
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+ buf_block_t* block, /*!< in: control block */
+ ibool old); /*!< in: TRUE if should be put to the end
+ of the list, else put to the start */
+/******************************************************************//**
+Moves a block to the start of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_young(
+/*=====================*/
+ buf_page_t* bpage); /*!< in: control block */
+/******************************************************************//**
+Moves a block to the end of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_old(
+/*===================*/
+ buf_page_t* bpage); /*!< in: control block */
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+UNIV_INTERN
+void
+buf_LRU_stat_update(void);
+/*=====================*/
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Validates the LRU list.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_LRU_validate(void);
+/*==================*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Prints the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_print(void);
+/*===============*/
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/** @brief Statistics for selecting the LRU list for eviction.
+
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics we decide
+if we want to evict from buf_pool->unzip_LRU or buf_pool->LRU. */
+struct buf_LRU_stat_struct
+{
+ ulint io; /**< Counter of buffer pool I/O operations. */
+ ulint unzip; /**< Counter of page_zip_decompress operations. */
+};
+
+/** Statistics for selecting the LRU list for eviction. */
+typedef struct buf_LRU_stat_struct buf_LRU_stat_t;
+
+/** Current operation counters. Not protected by any mutex.
+Cleared by buf_LRU_stat_update(). */
+extern buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Protected by buf_pool_mutex. */
+extern buf_LRU_stat_t buf_LRU_stat_sum;
+
+/********************************************************************//**
+Increments the I/O counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++
+/********************************************************************//**
+Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++
+
+#ifndef UNIV_NONINL
+#include "buf0lru.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/buf0lru.ic b/storage/innobase/include/buf0lru.ic
new file mode 100644
index 00000000000..556f45d987f
--- /dev/null
+++ b/storage/innobase/include/buf0lru.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.ic
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
new file mode 100644
index 00000000000..b4d25e6fde0
--- /dev/null
+++ b/storage/innobase/include/buf0rea.h
@@ -0,0 +1,139 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0rea.h
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0rea_h
+#define buf0rea_h
+
+#include "univ.i"
+#include "buf0types.h"
+
+/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread. Does a random read-ahead if it seems
+sensible.
+@return number of page read requests issued: this can be greater than
+1 if read-ahead occurred */
+UNIV_INTERN
+ulint
+buf_read_page(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@return number of page read requests issued */
+UNIV_INTERN
+ulint
+buf_read_ahead_linear(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint offset);/*!< in: page number of a page; NOTE: the current thread
+ must want access to this page (see NOTE 3 above) */
+/********************************************************************//**
+Issues read requests for pages which the ibuf module wants to read in, in
+order to contract the insert buffer tree. Technically, this function is like
+a read-ahead function. */
+UNIV_INTERN
+void
+buf_read_ibuf_merge_pages(
+/*======================*/
+ ibool sync, /*!< in: TRUE if the caller
+ wants this function to wait
+ for the highest address page
+ to get read in, before this
+ function returns */
+ const ulint* space_ids, /*!< in: array of space ids */
+ const ib_int64_t* space_versions,/*!< in: the spaces must have
+ this version number
+ (timestamp), otherwise we
+ discard the read; we use this
+ to cancel reads if DISCARD +
+ IMPORT may have changed the
+ tablespace size */
+ const ulint* page_nos, /*!< in: array of page numbers
+ to read, with the highest page
+ number the last in the
+ array */
+ ulint n_stored); /*!< in: number of elements
+ in the arrays */
+/********************************************************************//**
+Issues read requests for pages which recovery wants to read in. */
+UNIV_INTERN
+void
+buf_read_recv_pages(
+/*================*/
+ ibool sync, /*!< in: TRUE if the caller
+ wants this function to wait
+ for the highest address page
+ to get read in, before this
+ function returns */
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in
+ bytes, or 0 */
+ const ulint* page_nos, /*!< in: array of page numbers
+ to read, with the highest page
+ number the last in the
+ array */
+ ulint n_stored); /*!< in: number of page numbers
+ in the array */
+
+/** The size in pages of the area which the read-ahead algorithms read if
+invoked */
+#define BUF_READ_AHEAD_AREA \
+ ut_min(64, ut_2_power_up(buf_pool->curr_size / 32))
+
+/** @name Modes used in read-ahead @{ */
+/** read only pages belonging to the insert buffer tree */
+#define BUF_READ_IBUF_PAGES_ONLY 131
+/** read any page */
+#define BUF_READ_ANY_PAGE 132
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
new file mode 100644
index 00000000000..e7167d716a0
--- /dev/null
+++ b/storage/innobase/include/buf0types.h
@@ -0,0 +1,80 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0types.h
+The database buffer pool global types for the directory
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0types_h
+#define buf0types_h
+
+/** Buffer page (uncompressed or compressed) */
+typedef struct buf_page_struct buf_page_t;
+/** Buffer block for which an uncompressed page exists */
+typedef struct buf_block_struct buf_block_t;
+/** Buffer pool chunk comprising buf_block_t */
+typedef struct buf_chunk_struct buf_chunk_t;
+/** Buffer pool comprising buf_chunk_t */
+typedef struct buf_pool_struct buf_pool_t;
+
+/** A buffer frame. @see page_t */
+typedef byte buf_frame_t;
+
+/** Flags for flush types */
+enum buf_flush {
+ BUF_FLUSH_LRU = 0, /*!< flush via the LRU list */
+ BUF_FLUSH_SINGLE_PAGE, /*!< flush a single page */
+ BUF_FLUSH_LIST, /*!< flush via the flush list
+ of dirty blocks */
+ BUF_FLUSH_N_TYPES /*!< index of last element + 1 */
+};
+
+/** Flags for io_fix types */
+enum buf_io_fix {
+ BUF_IO_NONE = 0, /**< no pending I/O */
+ BUF_IO_READ, /**< read pending */
+ BUF_IO_WRITE /**< write pending */
+};
+
+/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
+/* @{ */
+#if UNIV_WORD_SIZE <= 4 /* 32-bit system */
+/** Base-2 logarithm of the smallest buddy block size */
+# define BUF_BUDDY_LOW_SHIFT 6
+#else /* 64-bit system */
+/** Base-2 logarithm of the smallest buddy block size */
+# define BUF_BUDDY_LOW_SHIFT 7
+#endif
+#define BUF_BUDDY_LOW (1 << BUF_BUDDY_LOW_SHIFT)
+ /*!< minimum block size in the binary
+ buddy system; must be at least
+ sizeof(buf_page_t) */
+#define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT)
+ /*!< number of buddy sizes */
+
+/** twice the maximum block size of the buddy system;
+the underlying memory is aligned by this amount:
+this must be equal to UNIV_PAGE_SIZE */
+#define BUF_BUDDY_HIGH (BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
+/* @} */
+
+#endif
+
diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h
new file mode 100644
index 00000000000..f9fce3f3657
--- /dev/null
+++ b/storage/innobase/include/data0data.h
@@ -0,0 +1,483 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.h
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0data_h
+#define data0data_h
+
+#include "univ.i"
+
+#include "data0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+
+/** Storage for overflow data in a big record, that is, a clustered
+index record which needs external storage of data fields */
+typedef struct big_rec_struct big_rec_t;
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets pointer to the type struct of SQL data field.
+@return pointer to the type struct */
+UNIV_INLINE
+dtype_t*
+dfield_get_type(
+/*============*/
+ const dfield_t* field); /*!< in: SQL data field */
+/*********************************************************************//**
+Gets pointer to the data in a field.
+@return pointer to data */
+UNIV_INLINE
+void*
+dfield_get_data(
+/*============*/
+ const dfield_t* field); /*!< in: field */
+#else /* UNIV_DEBUG */
+# define dfield_get_type(field) (&(field)->type)
+# define dfield_get_data(field) ((field)->data)
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /*!< in: SQL data field */
+ dtype_t* type); /*!< in: pointer to data type struct */
+/*********************************************************************//**
+Gets length of field data.
+@return length of data; UNIV_SQL_NULL if SQL null data */
+UNIV_INLINE
+ulint
+dfield_get_len(
+/*===========*/
+ const dfield_t* field); /*!< in: field */
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /*!< in: field */
+ ulint len); /*!< in: length or UNIV_SQL_NULL */
+/*********************************************************************//**
+Determines if a field is SQL NULL
+@return nonzero if SQL null data */
+UNIV_INLINE
+ulint
+dfield_is_null(
+/*===========*/
+ const dfield_t* field); /*!< in: field */
+/*********************************************************************//**
+Determines if a field is externally stored
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+dfield_is_ext(
+/*==========*/
+ const dfield_t* field); /*!< in: field */
+/*********************************************************************//**
+Sets the "external storage" flag */
+UNIV_INLINE
+void
+dfield_set_ext(
+/*===========*/
+ dfield_t* field); /*!< in/out: field */
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len); /*!< in: length or UNIV_SQL_NULL */
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field); /*!< in/out: field */
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len); /*!< in: SQL null size in bytes */
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2);/*!< in: field to copy from */
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2);/*!< in: field to copy from */
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap); /*!< in: memory heap where allocated */
+/*********************************************************************//**
+Tests if data length and content is equal for two dfields.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2);/*!< in: field */
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */
+UNIV_INTERN
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+ const dfield_t* field, /*!< in: field */
+ ulint len, /*!< in: data length or UNIV_SQL_NULL */
+ const byte* data); /*!< in: data */
+/*********************************************************************//**
+Gets number of fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+ const dtuple_t* tuple); /*!< in: tuple */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets nth field of a tuple.
+@return nth field */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: tuple */
+ ulint n); /*!< in: index of field */
+#else /* UNIV_DEBUG */
+# define dtuple_get_nth_field(tuple, n) ((tuple)->fields + (n))
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ const dtuple_t* tuple); /*!< in: tuple */
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint info_bits); /*!< in: info bits */
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ const dtuple_t* tuple); /*!< in: tuple */
+/*********************************************************************//**
+Gets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields_cmp); /*!< in: number of fields used in
+ comparisons in rem0cmp.* */
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created */
+ ulint n_fields); /*!< in: number of fields */
+
+/**********************************************************//**
+Wrap data fields in a tuple. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return data tuple */
+UNIV_INLINE
+const dtuple_t*
+dtuple_from_fields(
+/*===============*/
+ dtuple_t* tuple, /*!< in: storage for data tuple */
+ const dfield_t* fields, /*!< in: fields */
+ ulint n_fields); /*!< in: number of fields */
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you want later to set it smaller, you can use this. */
+UNIV_INTERN
+void
+dtuple_set_n_fields(
+/*================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields); /*!< in: number of fields */
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+ const dtuple_t* tuple, /*!< in: tuple to copy from */
+ mem_heap_t* heap); /*!< in: memory heap
+ where the tuple is created */
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted.
+@return sum of data lens */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: typed data tuple */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+ const dtuple_t* tuple); /*!< in: tuple */
+/************************************************************//**
+Compare two data tuples, respecting the collation of character fields.
+@return 1, 0 , -1 if tuple1 is greater, equal, less, respectively,
+than tuple2 */
+UNIV_INTERN
+int
+dtuple_coll_cmp(
+/*============*/
+ const dtuple_t* tuple1, /*!< in: tuple 1 */
+ const dtuple_t* tuple2);/*!< in: tuple 2 */
+/************************************************************//**
+Folds a prefix given as the number of fields of a tuple.
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+ const dtuple_t* tuple, /*!< in: the tuple */
+ ulint n_fields,/*!< in: number of complete fields to fold */
+ ulint n_bytes,/*!< in: number of bytes to fold in an
+ incomplete last field */
+ dulint tree_id)/*!< in: index tree id */
+ __attribute__((pure));
+/*******************************************************************//**
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /*!< in: data tuple */
+ ulint n); /*!< in: number of fields to set */
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple); /*!< in: dtuple */
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dfield_check_typed(
+/*===============*/
+ const dfield_t* field); /*!< in: data field */
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed(
+/*===============*/
+ const dtuple_t* tuple); /*!< in: tuple */
+/**********************************************************//**
+Checks that a data tuple is typed.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+ const dtuple_t* tuple); /*!< in: tuple */
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e,
+all fields must have been set.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_validate(
+/*============*/
+ const dtuple_t* tuple); /*!< in: tuple */
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+UNIV_INTERN
+void
+dfield_print(
+/*=========*/
+ const dfield_t* dfield);/*!< in: dfield */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. Also the hex string
+is printed if a string contains non-printable characters. */
+UNIV_INTERN
+void
+dfield_print_also_hex(
+/*==================*/
+ const dfield_t* dfield); /*!< in: dfield */
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+UNIV_INTERN
+void
+dtuple_print(
+/*=========*/
+ FILE* f, /*!< in: output stream */
+ const dtuple_t* tuple); /*!< in: tuple */
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry or the index is clustered */
+UNIV_INTERN
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in/out: index entry */
+ ulint* n_ext); /*!< in/out: number of
+ externally stored columns */
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+UNIV_INTERN
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: entry whose data was put to vector */
+ big_rec_t* vector);/*!< in, own: big rec vector; it is
+ freed in this function */
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector); /*!< in, own: big rec vector; it is
+ freed in this function */
+
+/*######################################################################*/
+
+/** Structure for an SQL data field */
+struct dfield_struct{
+ void* data; /*!< pointer to data */
+ unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */
+ unsigned len:32; /*!< data length; UNIV_SQL_NULL if SQL null */
+ dtype_t type; /*!< type of data */
+};
+
+/** Structure for an SQL data tuple of fields (logical record) */
+struct dtuple_struct {
+ ulint info_bits; /*!< info bits of an index record:
+ the default is 0; this field is used
+ if an index record is built from
+ a data tuple */
+ ulint n_fields; /*!< number of fields in dtuple */
+ ulint n_fields_cmp; /*!< number of fields which should
+ be used in comparison services
+ of rem0cmp.*; the index search
+ is performed by comparing only these
+ fields, others are ignored; the
+ default value in dtuple creation is
+ the same value as n_fields */
+ dfield_t* fields; /*!< fields */
+ UT_LIST_NODE_T(dtuple_t) tuple_list;
+ /*!< data tuples can be linked into a
+ list using this field */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< magic number, used in
+ debug assertions */
+/** Value of dtuple_struct::magic_n */
+# define DATA_TUPLE_MAGIC_N 65478679
+#endif /* UNIV_DEBUG */
+};
+
+/** A slot for a field in a big rec vector */
+typedef struct big_rec_field_struct big_rec_field_t;
+/** A slot for a field in a big rec vector */
+struct big_rec_field_struct {
+ ulint field_no; /*!< field number in record */
+ ulint len; /*!< stored data length, in bytes */
+ const void* data; /*!< stored data */
+};
+
+/** Storage format for overflow data in a big record, that is, a
+clustered index record which needs external storage of data fields */
+struct big_rec_struct {
+ mem_heap_t* heap; /*!< memory heap from which
+ allocated */
+ ulint n_fields; /*!< number of stored fields */
+ big_rec_field_t*fields; /*!< stored fields */
+};
+
+#ifndef UNIV_NONINL
+#include "data0data.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/data0data.ic b/storage/innobase/include/data0data.ic
new file mode 100644
index 00000000000..da79aa33702
--- /dev/null
+++ b/storage/innobase/include/data0data.ic
@@ -0,0 +1,612 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.ic
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0mem.h"
+#include "ut0rnd.h"
+
+#ifdef UNIV_DEBUG
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+extern byte data_error;
+
+/*********************************************************************//**
+Gets pointer to the type struct of SQL data field.
+@return pointer to the type struct */
+UNIV_INLINE
+dtype_t*
+dfield_get_type(
+/*============*/
+ const dfield_t* field) /*!< in: SQL data field */
+{
+ ut_ad(field);
+
+ return((dtype_t*) &(field->type));
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /*!< in: SQL data field */
+ dtype_t* type) /*!< in: pointer to data type struct */
+{
+ ut_ad(field && type);
+
+ field->type = *type;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets pointer to the data in a field.
+@return pointer to data */
+UNIV_INLINE
+void*
+dfield_get_data(
+/*============*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+ ut_ad((field->len == UNIV_SQL_NULL)
+ || (field->data != &data_error));
+
+ return((void*) field->data);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets length of field data.
+@return length of data; UNIV_SQL_NULL if SQL null data */
+UNIV_INLINE
+ulint
+dfield_get_len(
+/*===========*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+ ut_ad((field->len == UNIV_SQL_NULL)
+ || (field->data != &data_error));
+
+ return(field->len);
+}
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /*!< in: field */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ ut_ad(field);
+#ifdef UNIV_VALGRIND_DEBUG
+ if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(field->data, len);
+#endif /* UNIV_VALGRIND_DEBUG */
+
+ field->ext = 0;
+ field->len = len;
+}
+
+/*********************************************************************//**
+Determines if a field is SQL NULL
+@return nonzero if SQL null data */
+UNIV_INLINE
+ulint
+dfield_is_null(
+/*===========*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+
+ return(field->len == UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Determines if a field is externally stored
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+dfield_is_ext(
+/*==========*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+
+ return(UNIV_UNLIKELY(field->ext));
+}
+
+/*********************************************************************//**
+Sets the "external storage" flag */
+UNIV_INLINE
+void
+dfield_set_ext(
+/*===========*/
+ dfield_t* field) /*!< in/out: field */
+{
+ ut_ad(field);
+
+ field->ext = 1;
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ ut_ad(field);
+
+#ifdef UNIV_VALGRIND_DEBUG
+ if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(data, len);
+#endif /* UNIV_VALGRIND_DEBUG */
+ field->data = (void*) data;
+ field->ext = 0;
+ field->len = len;
+}
+
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field) /*!< in/out: field */
+{
+ dfield_set_data(field, NULL, UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ ut_ad(field1 && field2);
+
+ field1->data = field2->data;
+ field1->len = field2->len;
+ field1->ext = field2->ext;
+}
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ *field1 = *field2;
+}
+
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap) /*!< in: memory heap where allocated */
+{
+ if (!dfield_is_null(field)) {
+ UNIV_MEM_ASSERT_RW(field->data, field->len);
+ field->data = mem_heap_dup(heap, field->data, field->len);
+ }
+}
+
+/*********************************************************************//**
+Tests if data length and content is equal for two dfields.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2) /*!< in: field */
+{
+ ulint len;
+
+ len = field1->len;
+
+ return(len == field2->len
+ && (len == UNIV_SQL_NULL
+ || !memcmp(field1->data, field2->data, len)));
+}
+
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->info_bits);
+}
+
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint info_bits) /*!< in: info bits */
+{
+ ut_ad(tuple);
+
+ tuple->info_bits = info_bits;
+}
+
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->n_fields_cmp);
+}
+
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields_cmp) /*!< in: number of fields used in
+ comparisons in rem0cmp.* */
+{
+ ut_ad(tuple);
+ ut_ad(n_fields_cmp <= tuple->n_fields);
+
+ tuple->n_fields_cmp = n_fields_cmp;
+}
+
+/*********************************************************************//**
+Gets number of fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets nth field of a tuple.
+@return nth field */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: tuple */
+ ulint n) /*!< in: index of field */
+{
+ ut_ad(tuple);
+ ut_ad(n < tuple->n_fields);
+
+ return((dfield_t*) tuple->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created */
+ ulint n_fields) /*!< in: number of fields */
+{
+ dtuple_t* tuple;
+
+ ut_ad(heap);
+
+ tuple = (dtuple_t*) mem_heap_alloc(heap, sizeof(dtuple_t)
+ + n_fields * sizeof(dfield_t));
+ tuple->info_bits = 0;
+ tuple->n_fields = n_fields;
+ tuple->n_fields_cmp = n_fields;
+ tuple->fields = (dfield_t*) &tuple[1];
+
+#ifdef UNIV_DEBUG
+ tuple->magic_n = DATA_TUPLE_MAGIC_N;
+
+ { /* In the debug version, initialize fields to an error value */
+ ulint i;
+
+ for (i = 0; i < n_fields; i++) {
+ dfield_t* field;
+
+ field = dtuple_get_nth_field(tuple, i);
+
+ dfield_set_len(field, UNIV_SQL_NULL);
+ field->data = &data_error;
+ dfield_get_type(field)->mtype = DATA_ERROR;
+ }
+ }
+
+ UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields);
+#endif
+ return(tuple);
+}
+
+/**********************************************************//**
+Wrap data fields in a tuple. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return data tuple */
+UNIV_INLINE
+const dtuple_t*
+dtuple_from_fields(
+/*===============*/
+ dtuple_t* tuple, /*!< in: storage for data tuple */
+ const dfield_t* fields, /*!< in: fields */
+ ulint n_fields) /*!< in: number of fields */
+{
+ tuple->info_bits = 0;
+ tuple->n_fields = tuple->n_fields_cmp = n_fields;
+ tuple->fields = (dfield_t*) fields;
+ ut_d(tuple->magic_n = DATA_TUPLE_MAGIC_N);
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+ const dtuple_t* tuple, /*!< in: tuple to copy from */
+ mem_heap_t* heap) /*!< in: memory heap
+ where the tuple is created */
+{
+ ulint n_fields = dtuple_get_n_fields(tuple);
+ dtuple_t* new_tuple = dtuple_create(heap, n_fields);
+ ulint i;
+
+ for (i = 0; i < n_fields; i++) {
+ dfield_copy(dtuple_get_nth_field(new_tuple, i),
+ dtuple_get_nth_field(tuple, i));
+ }
+
+ return(new_tuple);
+}
+
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted. Neither
+is possible space in externally stored parts of the field.
+@return sum of data lengths */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: typed data tuple */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ const dfield_t* field;
+ ulint n_fields;
+ ulint len;
+ ulint i;
+ ulint sum = 0;
+
+ ut_ad(tuple);
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ n_fields = tuple->n_fields;
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+ len = dfield_get_len(field);
+
+ if (len == UNIV_SQL_NULL) {
+ len = dtype_get_sql_null_size(dfield_get_type(field),
+ comp);
+ }
+
+ sum += len;
+ }
+
+ return(sum);
+}
+
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ulint n_ext = 0;
+ ulint n_fields = tuple->n_fields;
+ ulint i;
+
+ ut_ad(tuple);
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ for (i = 0; i < n_fields; i++) {
+ n_ext += dtuple_get_nth_field(tuple, i)->ext;
+ }
+
+ return(n_ext);
+}
+
+/*******************************************************************//**
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /*!< in: data tuple */
+ ulint n) /*!< in: number of fields to set */
+{
+ dtype_t* dfield_type;
+ ulint i;
+
+ for (i = 0; i < n; i++) {
+ dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i));
+ dtype_set(dfield_type, DATA_BINARY, 0, 0);
+ }
+}
+
+/************************************************************//**
+Folds a prefix given as the number of fields of a tuple.
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+ const dtuple_t* tuple, /*!< in: the tuple */
+ ulint n_fields,/*!< in: number of complete fields to fold */
+ ulint n_bytes,/*!< in: number of bytes to fold in an
+ incomplete last field */
+ dulint tree_id)/*!< in: index tree id */
+{
+ const dfield_t* field;
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+
+ ut_ad(tuple);
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple));
+
+ fold = ut_fold_dulint(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
+
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len) /*!< in: SQL null size in bytes */
+{
+ memset(data, 0, len);
+}
+
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: dtuple */
+{
+ ulint n;
+ ulint i;
+
+ n = dtuple_get_n_fields(tuple);
+
+ for (i = 0; i < n; i++) {
+ if (dfield_is_null(dtuple_get_nth_field(tuple, i))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+{
+ mem_heap_free(vector->heap);
+}
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
new file mode 100644
index 00000000000..a73bed3a9f5
--- /dev/null
+++ b/storage/innobase/include/data0type.h
@@ -0,0 +1,486 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.h
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef data0type_h
+#define data0type_h
+
+#include "univ.i"
+
+extern ulint data_mysql_default_charset_coll;
+#define DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL 8
+#define DATA_MYSQL_BINARY_CHARSET_COLL 63
+
+/* SQL data type struct */
+typedef struct dtype_struct dtype_t;
+
+/*-------------------------------------------*/
+/* The 'MAIN TYPE' of a column */
+#define DATA_VARCHAR 1 /* character varying of the
+ latin1_swedish_ci charset-collation; note
+ that the MySQL format for this, DATA_BINARY,
+ DATA_VARMYSQL, is also affected by whether the
+ 'precise type' contains
+ DATA_MYSQL_TRUE_VARCHAR */
+#define DATA_CHAR 2 /* fixed length character of the
+ latin1_swedish_ci charset-collation */
+#define DATA_FIXBINARY 3 /* binary string of fixed length */
+#define DATA_BINARY 4 /* binary string */
+#define DATA_BLOB 5 /* binary large object, or a TEXT type;
+ if prtype & DATA_BINARY_TYPE == 0, then this is
+ actually a TEXT column (or a BLOB created
+ with < 4.0.14; since column prefix indexes
+ came only in 4.0.14, the missing flag in BLOBs
+ created before that does not cause any harm) */
+#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */
+#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */
+#define DATA_SYS 8 /* system column */
+
+/* Data types >= DATA_FLOAT must be compared using the whole field, not as
+binary strings */
+
+#define DATA_FLOAT 9
+#define DATA_DOUBLE 10
+#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */
+#define DATA_VARMYSQL 12 /* any charset varying length char */
+#define DATA_MYSQL 13 /* any charset fixed length char */
+ /* NOTE that 4.1.1 used DATA_MYSQL and
+ DATA_VARMYSQL for all character sets, and the
+ charset-collation for tables created with it
+ can also be latin1_swedish_ci */
+#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size()
+ requires the values are <= 63 */
+/*-------------------------------------------*/
+/* The 'PRECISE TYPE' of a column */
+/*
+Tables created by a MySQL user have the following convention:
+
+- In the least significant byte in the precise type we store the MySQL type
+code (not applicable for system columns).
+
+- In the second least significant byte we OR flags DATA_NOT_NULL,
+DATA_UNSIGNED, DATA_BINARY_TYPE.
+
+- In the third least significant byte of the precise type of string types we
+store the MySQL charset-collation code. In DATA_BLOB columns created with
+< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there
+are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no
+problem, though.
+
+Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the
+precise type, since the charset was always the default charset of the MySQL
+installation. If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
+
+#define DATA_ENGLISH 4 /* English language character string: this
+ is a relic from pre-MySQL time and only used
+ for InnoDB's own system tables */
+#define DATA_ERROR 111 /* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK 255 /* AND with this mask to extract the MySQL
+ type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+ format true VARCHAR */
+
+/* Precise data types for system columns and the length of those columns;
+NOTE: the values must run from 0 up in the order given! All codes must
+be less than 256 */
+#define DATA_ROW_ID 0 /* row id: a dulint */
+#define DATA_ROW_ID_LEN 6 /* stored length for row id */
+
+#define DATA_TRX_ID 1 /* transaction id: 6 bytes */
+#define DATA_TRX_ID_LEN 6
+
+#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */
+#define DATA_ROLL_PTR_LEN 7
+
+#define DATA_N_SYS_COLS 3 /* number of system columns defined above */
+
+#define DATA_SYS_PRTYPE_MASK 0xF /* mask to extract the above from prtype */
+
+/* Flags ORed to the precise data type */
+#define DATA_NOT_NULL 256 /* this is ORed to the precise type when
+ the column is declared as NOT NULL */
+#define DATA_UNSIGNED 512 /* this id ORed to the precise type when
+ we have an unsigned integer type */
+#define DATA_BINARY_TYPE 1024 /* if the data type is a binary character
+ string, this is ORed to the precise type:
+ this only holds for tables created with
+ >= MySQL-4.0.14 */
+/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1.
+ In earlier versions this was set for some
+ BLOB columns.
+*/
+#define DATA_LONG_TRUE_VARCHAR 4096 /* this is ORed to the precise data
+ type when the column is true VARCHAR where
+ MySQL uses 2 bytes to store the data len;
+ for shorter VARCHARs MySQL uses only 1 byte */
+/*-------------------------------------------*/
+
+/* This many bytes we need to store the type information affecting the
+alphabetical order for a single field and decide the storage size of an
+SQL null*/
+#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4
+/* In the >= 4.1.x storage format we add 2 bytes more so that we can also
+store the charset-collation number; one byte is left unused, though */
+#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+ const dtype_t* type); /*!< in: type struct */
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
+@return length of the prefix, in bytes */
+UNIV_INTERN
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+ ulint prtype, /*!< in: precise type */
+ ulint mbminlen, /*!< in: minimum length of a
+ multi-byte character */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multi-byte character */
+ ulint prefix_len, /*!< in: length of the requested
+ prefix, in characters, multiplied by
+ dtype_get_mbmaxlen(dtype) */
+ ulint data_len, /*!< in: length of str (in bytes) */
+ const char* str); /*!< in: the string whose prefix
+ length is being determined */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Checks if a data main type is a string type. Also a BLOB is considered a
+string type.
+@return TRUE if string type */
+UNIV_INTERN
+ibool
+dtype_is_string_type(
+/*=================*/
+ ulint mtype); /*!< in: InnoDB main data type code: DATA_CHAR, ... */
+/*********************************************************************//**
+Checks if a type is a binary string type. Note that for tables created with
+< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For
+those DATA_BLOB columns this function currently returns FALSE.
+@return TRUE if binary string type */
+UNIV_INTERN
+ibool
+dtype_is_binary_string_type(
+/*========================*/
+ ulint mtype, /*!< in: main data type */
+ ulint prtype);/*!< in: precise type */
+/*********************************************************************//**
+Checks if a type is a non-binary string type. That is, dtype_is_string_type is
+TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created
+with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column.
+For those DATA_BLOB columns this function currently returns TRUE.
+@return TRUE if non-binary string type */
+UNIV_INTERN
+ibool
+dtype_is_non_binary_string_type(
+/*============================*/
+ ulint mtype, /*!< in: main data type */
+ ulint prtype);/*!< in: precise type */
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len); /*!< in: precision of type */
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2); /*!< in: type struct to copy from */
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type); /*!< in: data type */
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type); /*!< in: data type */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_get_mblen(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type (and collation) */
+ ulint* mbminlen, /*!< out: minimum length of a
+ multi-byte character */
+ ulint* mbmaxlen); /*!< out: maximum length of a
+ multi-byte character */
+/*********************************************************************//**
+Gets the MySQL charset-collation code for MySQL string types.
+@return MySQL charset-collation code */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+ ulint prtype);/*!< in: precise data type */
+/*********************************************************************//**
+Forms a precise type from the < 4.1.2 format precise type plus the
+charset-collation code.
+@return precise type, including the charset-collation code */
+UNIV_INTERN
+ulint
+dtype_form_prtype(
+/*==============*/
+ ulint old_prtype, /*!< in: the MySQL type code and the flags
+ DATA_BINARY_TYPE etc. */
+ ulint charset_coll); /*!< in: MySQL charset-collation code */
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return TRUE if a subset of UTF-8 */
+UNIV_INLINE
+ibool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype);/*!< in: precise data type */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type); /*!< in: data type */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/*********************************************************************//**
+Gets the padding character code for the type.
+@return padding character code, or ULINT_UNDEFINED if no padding specified */
+UNIV_INLINE
+ulint
+dtype_get_pad_char(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype); /*!< in: precise type */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a multibyte char */
+ ulint mbmaxlen, /*!< in: maximum length of a multibyte char */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+ulint
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a multibyte char */
+ ulint mbmaxlen); /*!< in: maximum length of a multibyte char */
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len); /*!< in: length */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ const dtype_t* type, /*!< in: type */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf); /*!< in: buffer for the stored order info */
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len);/*!< in: prefix length to
+ replace type->len, or 0 */
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf); /*!< in: buffer for stored type order info */
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Validates a data type structure.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtype_validate(
+/*===========*/
+ const dtype_t* type); /*!< in: type struct to validate */
+/*********************************************************************//**
+Prints a data type structure. */
+UNIV_INTERN
+void
+dtype_print(
+/*========*/
+ const dtype_t* type); /*!< in: type */
+
+/* Structure for an SQL data type.
+If you add fields to this structure, be sure to initialize them everywhere.
+This structure is initialized in the following functions:
+dtype_set()
+dtype_read_for_order_and_null_size()
+dtype_new_read_for_order_and_null_size()
+sym_tab_add_null_lit() */
+
+struct dtype_struct{
+ unsigned mtype:8; /*!< main data type */
+ unsigned prtype:24; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+#ifndef UNIV_HOTBACKUP
+ unsigned mbminlen:2; /*!< minimum length of a
+ character, in bytes */
+ unsigned mbmaxlen:3; /*!< maximum length of a
+ character, in bytes */
+#endif /* !UNIV_HOTBACKUP */
+};
+
+#ifndef UNIV_NONINL
+#include "data0type.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic
new file mode 100644
index 00000000000..240b4288f39
--- /dev/null
+++ b/storage/innobase/include/data0type.ic
@@ -0,0 +1,599 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.ic
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+
+/*********************************************************************//**
+Gets the MySQL charset-collation code for MySQL string types.
+@return MySQL charset-collation code */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+ ulint prtype) /*!< in: precise data type */
+{
+ return((prtype >> 16) & 0xFFUL);
+}
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return TRUE if a subset of UTF-8 */
+UNIV_INLINE
+ibool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype) /*!< in: precise data type */
+{
+ /* These codes have been copied from strings/ctype-extra.c
+ and strings/ctype-utf8.c. */
+ switch (dtype_get_charset_coll(prtype)) {
+ case 11: /* ascii_general_ci */
+ case 65: /* ascii_bin */
+ case 33: /* utf8_general_ci */
+ case 83: /* utf8_bin */
+ case 254: /* utf8_general_cs */
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+ const dtype_t* type) /*!< in: type struct */
+{
+ return(type->prtype & 0xFFUL);
+}
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_get_mblen(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type (and collation) */
+ ulint* mbminlen, /*!< out: minimum length of a
+ multi-byte character */
+ ulint* mbmaxlen) /*!< out: maximum length of a
+ multi-byte character */
+{
+ if (dtype_is_string_type(mtype)) {
+ innobase_get_cset_width(dtype_get_charset_coll(prtype),
+ mbminlen, mbmaxlen);
+ ut_ad(*mbminlen <= *mbmaxlen);
+ ut_ad(*mbminlen <= 2); /* mbminlen in dtype_t is 0..3 */
+ ut_ad(*mbmaxlen < 1 << 3); /* mbmaxlen in dtype_t is 0..7 */
+ } else {
+ *mbminlen = *mbmaxlen = 0;
+ }
+}
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_set_mblen(
+/*============*/
+ dtype_t* type) /*!< in/out: type */
+{
+ ulint mbminlen;
+ ulint mbmaxlen;
+
+ dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen);
+ type->mbminlen = mbminlen;
+ type->mbmaxlen = mbmaxlen;
+
+ ut_ad(dtype_validate(type));
+}
+#else /* !UNIV_HOTBACKUP */
+# define dtype_set_mblen(type) (void) 0
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision of type */
+{
+ ut_ad(type);
+ ut_ad(mtype <= DATA_MTYPE_MAX);
+
+ type->mtype = mtype;
+ type->prtype = prtype;
+ type->len = len;
+
+ dtype_set_mblen(type);
+}
+
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2) /*!< in: type struct to copy from */
+{
+ *type1 = *type2;
+
+ ut_ad(dtype_validate(type1));
+}
+
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->mtype);
+}
+
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->prtype);
+}
+
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ ut_ad(type);
+ return(type->mbminlen);
+}
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ ut_ad(type);
+ return(type->mbmaxlen);
+}
+
+/*********************************************************************//**
+Gets the padding character code for a type.
+@return padding character code, or ULINT_UNDEFINED if no padding specified */
+UNIV_INLINE
+ulint
+dtype_get_pad_char(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype) /*!< in: precise type */
+{
+ switch (mtype) {
+ case DATA_FIXBINARY:
+ case DATA_BINARY:
+ if (UNIV_UNLIKELY(dtype_get_charset_coll(prtype)
+ == DATA_MYSQL_BINARY_CHARSET_COLL)) {
+ /* Starting from 5.0.18, do not pad
+ VARBINARY or BINARY columns. */
+ return(ULINT_UNDEFINED);
+ }
+ /* Fall through */
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ case DATA_MYSQL:
+ case DATA_VARMYSQL:
+ /* Space is the padding character for all char and binary
+ strings, and starting from 5.0.3, also for TEXT strings. */
+
+ return(0x20);
+ case DATA_BLOB:
+ if (!(prtype & DATA_BINARY_TYPE)) {
+ return(0x20);
+ }
+ /* Fall through */
+ default:
+ /* No padding specified */
+ return(ULINT_UNDEFINED);
+ }
+}
+
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len)/*!< in: prefix length to
+ replace type->len, or 0 */
+{
+#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+ ulint len;
+
+ buf[0] = (byte)(type->mtype & 0xFFUL);
+
+ if (type->prtype & DATA_BINARY_TYPE) {
+ buf[0] = buf[0] | 128;
+ }
+
+ /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) {
+ buf[0] = buf[0] | 64;
+ }
+ */
+
+ buf[1] = (byte)(type->prtype & 0xFFUL);
+
+ len = prefix_len ? prefix_len : type->len;
+
+ mach_write_to_2(buf + 2, len & 0xFFFFUL);
+
+ ut_ad(dtype_get_charset_coll(type->prtype) < 256);
+ mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
+
+ if (type->prtype & DATA_NOT_NULL) {
+ buf[4] |= 128;
+ }
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the < 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+#if 4 != DATA_ORDER_NULL_TYPE_BUF_SIZE
+# error "4 != DATA_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype = type->prtype | DATA_BINARY_TYPE;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ type->prtype = dtype_form_prtype(type->prtype,
+ data_mysql_default_charset_coll);
+ dtype_set_mblen(type);
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ ulint charset_coll;
+
+#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ if (buf[4] & 128) {
+ type->prtype |= DATA_NOT_NULL;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ charset_coll = mach_read_from_2(buf + 4) & 0x7fff;
+
+ if (dtype_is_string_type(type->mtype)) {
+ ut_a(charset_coll < 256);
+
+ if (charset_coll == 0) {
+ /* This insert buffer record was inserted with MySQL
+ version < 4.1.2, and the charset-collation code was not
+ explicitly stored to dtype->prtype at that time. It
+ must be the default charset-collation of this MySQL
+ installation. */
+
+ charset_coll = data_mysql_default_charset_coll;
+ }
+
+ type->prtype = dtype_form_prtype(type->prtype, charset_coll);
+ }
+ dtype_set_mblen(type);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a multibyte char */
+ ulint mbmaxlen, /*!< in: maximum length of a multibyte char */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return(len);
+ case DATA_MYSQL:
+#ifndef UNIV_HOTBACKUP
+ if (prtype & DATA_BINARY_TYPE) {
+ return(len);
+ } else if (!comp) {
+ return(len);
+ } else {
+ /* We play it safe here and ask MySQL for
+ mbminlen and mbmaxlen. Although
+ mbminlen and mbmaxlen are
+ initialized if and only if prtype
+ is (in one of the 3 functions in this file),
+ it could be that none of these functions
+ has been called. */
+
+ ulint i_mbminlen, i_mbmaxlen;
+
+ innobase_get_cset_width(
+ dtype_get_charset_coll(prtype),
+ &i_mbminlen, &i_mbmaxlen);
+
+ if (UNIV_UNLIKELY(mbminlen != i_mbminlen)
+ || UNIV_UNLIKELY(mbmaxlen != i_mbmaxlen)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: "
+ "mbminlen=%lu, "
+ "mbmaxlen=%lu, "
+ "type->mbminlen=%lu, "
+ "type->mbmaxlen=%lu\n",
+ (ulong) i_mbminlen,
+ (ulong) i_mbmaxlen,
+ (ulong) mbminlen,
+ (ulong) mbmaxlen);
+ }
+ if (mbminlen == mbmaxlen) {
+ return(len);
+ }
+ }
+#else /* !UNIV_HOTBACKUP */
+ return(len);
+#endif /* !UNIV_HOTBACKUP */
+ /* fall through for variable-length charsets */
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+ulint
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a multibyte char */
+ ulint mbmaxlen) /*!< in: maximum length of a multibyte char */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return(len);
+ case DATA_MYSQL:
+ if ((prtype & DATA_BINARY_TYPE) || mbminlen == mbmaxlen) {
+ return(len);
+ }
+ /* this is a variable-length character set */
+ ut_a(mbminlen > 0);
+ ut_a(mbmaxlen > mbminlen);
+ ut_a(len % mbmaxlen == 0);
+ return(len * mbminlen / mbmaxlen);
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len) /*!< in: length */
+{
+ switch (mtype) {
+ case DATA_SYS:
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_MYSQL:
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ return(len);
+ case DATA_BLOB:
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ULINT_MAX);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ const dtype_t* type, /*!< in: type */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+#ifndef UNIV_HOTBACKUP
+ return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len,
+ type->mbminlen, type->mbmaxlen, comp));
+#else /* !UNIV_HOTBACKUP */
+ return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len,
+ 0, 0, 0));
+#endif /* !UNIV_HOTBACKUP */
+}
diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h
new file mode 100644
index 00000000000..04e835bc401
--- /dev/null
+++ b/storage/innobase/include/data0types.h
@@ -0,0 +1,36 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0types.h
+Some type definitions
+
+Created 9/21/2000 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0types_h
+#define data0types_h
+
+/* SQL data field struct */
+typedef struct dfield_struct dfield_t;
+
+/* SQL data tuple struct */
+typedef struct dtuple_struct dtuple_t;
+
+#endif
+
diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h
new file mode 100644
index 00000000000..23898583b72
--- /dev/null
+++ b/storage/innobase/include/db0err.h
@@ -0,0 +1,105 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/db0err.h
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+
+enum db_err {
+ DB_SUCCESS = 10,
+
+ /* The following are error codes */
+ DB_ERROR,
+ DB_OUT_OF_MEMORY,
+ DB_OUT_OF_FILE_SPACE,
+ DB_LOCK_WAIT,
+ DB_DEADLOCK,
+ DB_ROLLBACK,
+ DB_DUPLICATE_KEY,
+ DB_QUE_THR_SUSPENDED,
+ DB_MISSING_HISTORY, /* required history data has been
+ deleted due to lack of space in
+ rollback segment */
+ DB_CLUSTER_NOT_FOUND = 30,
+ DB_TABLE_NOT_FOUND,
+ DB_MUST_GET_MORE_FILE_SPACE, /* the database has to be stopped
+ and restarted with more file space */
+ DB_TABLE_IS_BEING_USED,
+ DB_TOO_BIG_RECORD, /* a record in an index would not fit
+ on a compressed page, or it would
+ become bigger than 1/2 free space in
+ an uncompressed page frame */
+ DB_LOCK_WAIT_TIMEOUT, /* lock wait lasted too long */
+ DB_NO_REFERENCED_ROW, /* referenced key value not found
+ for a foreign key in an insert or
+ update of a row */
+ DB_ROW_IS_REFERENCED, /* cannot delete or update a row
+ because it contains a key value
+ which is referenced */
+ DB_CANNOT_ADD_CONSTRAINT, /* adding a foreign key constraint
+ to a table failed */
+ DB_CORRUPTION, /* data structure corruption noticed */
+ DB_COL_APPEARS_TWICE_IN_INDEX, /* InnoDB cannot handle an index
+ where same column appears twice */
+ DB_CANNOT_DROP_CONSTRAINT, /* dropping a foreign key constraint
+ from a table failed */
+ DB_NO_SAVEPOINT, /* no savepoint exists with the given
+ name */
+ DB_TABLESPACE_ALREADY_EXISTS, /* we cannot create a new single-table
+ tablespace because a file of the same
+ name already exists */
+ DB_TABLESPACE_DELETED, /* tablespace does not exist or is
+ being dropped right now */
+ DB_LOCK_TABLE_FULL, /* lock structs have exhausted the
+ buffer pool (for big transactions,
+ InnoDB stores the lock structs in the
+ buffer pool) */
+ DB_FOREIGN_DUPLICATE_KEY, /* foreign key constraints
+ activated by the operation would
+ lead to a duplicate key in some
+ table */
+ DB_TOO_MANY_CONCURRENT_TRXS, /* when InnoDB runs out of the
+ preconfigured undo slots, this can
+ only happen when there are too many
+ concurrent transactions */
+ DB_UNSUPPORTED, /* when InnoDB sees any artefact or
+ a feature that it can't recoginize or
+ work with e.g., FT indexes created by
+ a later version of the engine. */
+
+ DB_PRIMARY_KEY_IS_NULL, /* a column in the PRIMARY KEY
+ was found to be NULL */
+
+ /* The following are partial failure codes */
+ DB_FAIL = 1000,
+ DB_OVERFLOW,
+ DB_UNDERFLOW,
+ DB_STRONG_FAIL,
+ DB_ZIP_OVERFLOW,
+ DB_RECORD_NOT_FOUND = 1500,
+ DB_END_OF_INDEX
+};
+
+#endif
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
new file mode 100644
index 00000000000..51d37ee98d1
--- /dev/null
+++ b/storage/innobase/include/dict0boot.h
@@ -0,0 +1,150 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.h
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0boot_h
+#define dict0boot_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "buf0buf.h"
+#include "fsp0fsp.h"
+#include "dict0dict.h"
+
+typedef byte dict_hdr_t;
+
+/**********************************************************************//**
+Gets a pointer to the dictionary header and x-latches its page.
+@return pointer to the dictionary header, page x-latched */
+UNIV_INTERN
+dict_hdr_t*
+dict_hdr_get(
+/*=========*/
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Returns a new row, table, index, or tree id.
+@return the new id */
+UNIV_INTERN
+dulint
+dict_hdr_get_new_id(
+/*================*/
+ ulint type); /*!< in: DICT_HDR_ROW_ID, ... */
+/**********************************************************************//**
+Returns a new row id.
+@return the new id */
+UNIV_INLINE
+dulint
+dict_sys_get_new_row_id(void);
+/*=========================*/
+/**********************************************************************//**
+Reads a row id from a record or other 6-byte stored form.
+@return row id */
+UNIV_INLINE
+dulint
+dict_sys_read_row_id(
+/*=================*/
+ byte* field); /*!< in: record field */
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+ byte* field, /*!< in: record field */
+ dulint row_id);/*!< in: row id */
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created. */
+UNIV_INTERN
+void
+dict_boot(void);
+/*===========*/
+/*****************************************************************//**
+Creates and initializes the data dictionary at the database creation. */
+UNIV_INTERN
+void
+dict_create(void);
+/*=============*/
+
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ids for the basic system tables and their indexes */
+#define DICT_TABLES_ID ut_dulint_create(0, 1)
+#define DICT_COLUMNS_ID ut_dulint_create(0, 2)
+#define DICT_INDEXES_ID ut_dulint_create(0, 3)
+#define DICT_FIELDS_ID ut_dulint_create(0, 4)
+/* The following is a secondary index on SYS_TABLES */
+#define DICT_TABLE_IDS_ID ut_dulint_create(0, 5)
+
+#define DICT_HDR_FIRST_ID 10 /* the ids for tables etc. start
+ from this number, except for basic
+ system tables and their above defined
+ indexes; ibuf tables and indexes are
+ assigned as the id the number
+ DICT_IBUF_ID_MIN plus the space id */
+#define DICT_IBUF_ID_MIN ut_dulint_create(0xFFFFFFFFUL, 0)
+
+/* The offset of the dictionary header on the page */
+#define DICT_HDR FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */
+#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */
+#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */
+#define DICT_HDR_MIX_ID 24 /* Obsolete, always 0. */
+#define DICT_HDR_TABLES 32 /* Root of the table index tree */
+#define DICT_HDR_TABLE_IDS 36 /* Root of the table index tree */
+#define DICT_HDR_COLUMNS 40 /* Root of the column index tree */
+#define DICT_HDR_INDEXES 44 /* Root of the index index tree */
+#define DICT_HDR_FIELDS 48 /* Root of the index field
+ index tree */
+
+#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace
+ segment into which the dictionary
+ header is created */
+/*-------------------------------------------------------------*/
+
+/* The field number of the page number field in the sys_indexes table
+clustered index */
+#define DICT_SYS_INDEXES_PAGE_NO_FIELD 8
+#define DICT_SYS_INDEXES_SPACE_NO_FIELD 7
+#define DICT_SYS_INDEXES_TYPE_FIELD 6
+
+/* When a row id which is zero modulo this number (which must be a power of
+two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
+updated */
+#define DICT_HDR_ROW_ID_WRITE_MARGIN 256
+
+#ifndef UNIV_NONINL
+#include "dict0boot.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0boot.ic b/storage/innobase/include/dict0boot.ic
new file mode 100644
index 00000000000..d5f372e38c4
--- /dev/null
+++ b/storage/innobase/include/dict0boot.ic
@@ -0,0 +1,93 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.ic
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Writes the current value of the row id counter to the dictionary header file
+page. */
+UNIV_INTERN
+void
+dict_hdr_flush_row_id(void);
+/*=======================*/
+
+
+/**********************************************************************//**
+Returns a new row id.
+@return the new id */
+UNIV_INLINE
+dulint
+dict_sys_get_new_row_id(void)
+/*=========================*/
+{
+ dulint id;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ id = dict_sys->row_id;
+
+ if (0 == (ut_dulint_get_low(id) % DICT_HDR_ROW_ID_WRITE_MARGIN)) {
+
+ dict_hdr_flush_row_id();
+ }
+
+ UT_DULINT_INC(dict_sys->row_id);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(id);
+}
+
+/**********************************************************************//**
+Reads a row id from a record or other 6-byte stored form.
+@return row id */
+UNIV_INLINE
+dulint
+dict_sys_read_row_id(
+/*=================*/
+ byte* field) /*!< in: record field */
+{
+#if DATA_ROW_ID_LEN != 6
+# error "DATA_ROW_ID_LEN != 6"
+#endif
+
+ return(mach_read_from_6(field));
+}
+
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+ byte* field, /*!< in: record field */
+ dulint row_id) /*!< in: row id */
+{
+#if DATA_ROW_ID_LEN != 6
+# error "DATA_ROW_ID_LEN != 6"
+#endif
+
+ mach_write_to_6(field, row_id);
+}
+
+
diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h
new file mode 100644
index 00000000000..3107d771d88
--- /dev/null
+++ b/storage/innobase/include/dict0crea.h
@@ -0,0 +1,197 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.h
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0crea_h
+#define dict0crea_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "dict0dict.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/*********************************************************************//**
+Creates a table create graph.
+@return own: table create node */
+UNIV_INTERN
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+ dict_table_t* table, /*!< in: table to create, built as a memory data
+ structure */
+ mem_heap_t* heap); /*!< in: heap where created */
+/*********************************************************************//**
+Creates an index create graph.
+@return own: index create node */
+UNIV_INTERN
+ind_node_t*
+ind_create_graph_create(
+/*====================*/
+ dict_index_t* index, /*!< in: index to create, built as a memory data
+ structure */
+ mem_heap_t* heap); /*!< in: heap where created */
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Truncates the index tree associated with a row in SYS_INDEXES table.
+@return new root page number, or FIL_NULL on failure */
+UNIV_INTERN
+ulint
+dict_truncate_index_tree(
+/*=====================*/
+ dict_table_t* table, /*!< in: the table the index belongs to */
+ ulint space, /*!< in: 0=truncate,
+ nonzero=create the index tree in the
+ given tablespace */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing to
+ record in the clustered index of
+ SYS_INDEXES table. The cursor may be
+ repositioned in this call. */
+ mtr_t* mtr); /*!< in: mtr having the latch
+ on the record page. The mtr may be
+ committed and restarted in this call. */
+/*******************************************************************//**
+Drops the index tree associated with a row in SYS_INDEXES table. */
+UNIV_INTERN
+void
+dict_drop_index_tree(
+/*=================*/
+ rec_t* rec, /*!< in/out: record in the clustered index
+ of SYS_INDEXES table */
+ mtr_t* mtr); /*!< in: mtr having the latch on the record page */
+/****************************************************************//**
+Creates the foreign key constraints system tables inside InnoDB
+at database creation or database start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_create_or_check_foreign_constraint_tables(void);
+/*================================================*/
+/********************************************************************//**
+Adds foreign key definitions to data dictionary tables in the database. We
+look at table->foreign_list, and also generate names to constraints that were
+not named by the user. A generated constraint has a name of the format
+databasename/tablename_ibfk_<number>, where the numbers start from 1, and are
+given locally for this table, that is, the number is not global, as in the
+old format constraints < 4.0.18 it used to be.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+ ulint start_id,/*!< in: if we are actually doing ALTER TABLE
+ ADD CONSTRAINT, we want to generate constraint
+ numbers which are bigger than in the table so
+ far; we number the constraints from
+ start_id + 1 up; start_id should be set to 0 if
+ we are creating a new table, or if the table
+ so far has no constraints for which the name
+ was generated here */
+ dict_table_t* table, /*!< in: table */
+ trx_t* trx); /*!< in: transaction */
+
+/* Table create node structure */
+
+struct tab_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */
+ dict_table_t* table; /*!< table to create, built as a memory data
+ structure with dict_mem_... functions */
+ ins_node_t* tab_def; /* child node which does the insert of
+ the table definition; the row to be inserted
+ is built by the parent node */
+ ins_node_t* col_def; /* child node which does the inserts of
+ the column definitions; the row to be inserted
+ is built by the parent node */
+ commit_node_t* commit_node;
+ /* child node which performs a commit after
+ a successful table creation */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ ulint col_no; /*!< next column definition to insert */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage */
+};
+
+/* Table create node states */
+#define TABLE_BUILD_TABLE_DEF 1
+#define TABLE_BUILD_COL_DEF 2
+#define TABLE_COMMIT_WORK 3
+#define TABLE_ADD_TO_CACHE 4
+#define TABLE_COMPLETED 5
+
+/* Index create node struct */
+
+struct ind_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */
+ dict_index_t* index; /*!< index to create, built as a memory data
+ structure with dict_mem_... functions */
+ ins_node_t* ind_def; /* child node which does the insert of
+ the index definition; the row to be inserted
+ is built by the parent node */
+ ins_node_t* field_def; /* child node which does the inserts of
+ the field definitions; the row to be inserted
+ is built by the parent node */
+ commit_node_t* commit_node;
+ /* child node which performs a commit after
+ a successful index creation */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ ulint page_no;/* root page number of the index */
+ dict_table_t* table; /*!< table which owns the index */
+ dtuple_t* ind_row;/* index definition row built */
+ ulint field_no;/* next field definition to insert */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage */
+};
+
+/* Index create node states */
+#define INDEX_BUILD_INDEX_DEF 1
+#define INDEX_BUILD_FIELD_DEF 2
+#define INDEX_CREATE_INDEX_TREE 3
+#define INDEX_COMMIT_WORK 4
+#define INDEX_ADD_TO_CACHE 5
+
+#ifndef UNIV_NONINL
+#include "dict0crea.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0crea.ic b/storage/innobase/include/dict0crea.ic
new file mode 100644
index 00000000000..c5365ce7489
--- /dev/null
+++ b/storage/innobase/include/dict0crea.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.ic
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
new file mode 100644
index 00000000000..b2029699e51
--- /dev/null
+++ b/storage/innobase/include/dict0dict.h
@@ -0,0 +1,1158 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0dict.h
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0dict_h
+#define dict0dict_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "dict0mem.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "hash0hash.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "trx0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+dict_casedn_str(
+/*============*/
+ char* a); /*!< in/out: string to put in lower case */
+/********************************************************************//**
+Get the database name length in a table name.
+@return database name length */
+UNIV_INTERN
+ulint
+dict_get_db_name_len(
+/*=================*/
+ const char* name); /*!< in: table name in the form
+ dbname '/' tablename */
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+@return table name */
+
+const char*
+dict_remove_db_name(
+/*================*/
+ const char* name); /*!< in: table name in the form
+ dbname '/' tablename */
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_get_on_id(
+/*=================*/
+ dulint table_id, /*!< in: table id */
+ trx_t* trx); /*!< in: transaction handle */
+/********************************************************************//**
+Decrements the count of open MySQL handles to a table. */
+UNIV_INTERN
+void
+dict_table_decrement_handle_count(
+/*==============================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool dict_locked); /*!< in: TRUE=data dictionary locked */
+/**********************************************************************//**
+Inits the data dictionary module. */
+UNIV_INTERN
+void
+dict_init(void);
+/*===========*/
+/********************************************************************//**
+Gets the space id of every table of the data dictionary and makes a linear
+list and a hash table of them to the data dictionary cache. This function
+can be called at database startup if we did not need to do a crash recovery.
+In crash recovery we must scan the space id's from the .ibd files in MySQL
+database directories. */
+UNIV_INTERN
+void
+dict_load_space_id_list(void);
+/*=========================*/
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type); /*!< out: data type */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type); /*!< in: data type */
+#endif /* UNIV_DEBUG */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+ulint
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col); /*!< in: column */
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col); /*!< in: column */
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+ulint
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col); /*!< in: column */
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index); /*!< in: clustered index */
+/****************************************************************//**
+If the given column name is reserved for InnoDB system columns, return
+TRUE.
+@return TRUE if name is reserved */
+UNIV_INTERN
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+ const char* name); /*!< in: column name */
+/********************************************************************//**
+Acquire the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_lock(
+/*====================*/
+ dict_table_t* table); /*!< in/out: table */
+/********************************************************************//**
+Unconditionally set the autoinc counter. */
+UNIV_INTERN
+void
+dict_table_autoinc_initialize(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ ib_uint64_t value); /*!< in: next value to assign to a row */
+/********************************************************************//**
+Reads the next autoinc value (== autoinc counter value), 0 if not yet
+initialized.
+@return value for a new row, or 0 */
+UNIV_INTERN
+ib_uint64_t
+dict_table_autoinc_read(
+/*====================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Updates the autoinc counter if the value supplied is greater than the
+current value. */
+UNIV_INTERN
+void
+dict_table_autoinc_update_if_greater(
+/*=================================*/
+
+ dict_table_t* table, /*!< in/out: table */
+ ib_uint64_t value); /*!< in: value which was assigned to a row */
+/********************************************************************//**
+Release the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_unlock(
+/*======================*/
+ dict_table_t* table); /*!< in/out: table */
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Adds system columns to a table object. */
+UNIV_INTERN
+void
+dict_table_add_system_columns(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap); /*!< in: temporary heap */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Adds a table object to the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_add_to_cache(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap); /*!< in: temporary heap */
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_remove_from_cache(
+/*=========================*/
+ dict_table_t* table); /*!< in, own: table */
+/**********************************************************************//**
+Renames a table object.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+dict_table_rename_in_cache(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char* new_name, /*!< in: new name */
+ ibool rename_also_foreigns);/*!< in: in ALTER TABLE we want
+ to preserve the original table name
+ in constraints which reference it */
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index); /*!< in, own: index */
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+UNIV_INTERN
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table object already in cache */
+ dulint new_id);/*!< in: new id to set */
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if there already is an object with the same identifier in.
+At least one of foreign table or referenced table must already be in
+the dictionary cache!
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_foreign_add_to_cache(
+/*======================*/
+ dict_foreign_t* foreign, /*!< in, own: foreign key constraint */
+ ibool check_charsets);/*!< in: TRUE=check charset
+ compatibility */
+/*********************************************************************//**
+Check if the index is referenced by a foreign key, if TRUE return the
+matching instance NULL otherwise.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_referenced_constraint(
+/*=================================*/
+ dict_table_t* table, /*!< in: InnoDB table */
+ dict_index_t* index); /*!< in: InnoDB index */
+/*********************************************************************//**
+Checks if a table is referenced by foreign keys.
+@return TRUE if table is referenced by a foreign key */
+UNIV_INTERN
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+ const dict_table_t* table); /*!< in: InnoDB table */
+/**********************************************************************//**
+Replace the index in the foreign key list that matches this index's
+definition with an equivalent index. */
+UNIV_INTERN
+void
+dict_table_replace_index_in_foreign_list(
+/*=====================================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index); /*!< in: index to be replaced */
+/*********************************************************************//**
+Checks if a index is defined for a foreign key constraint. Index is a part
+of a foreign key constraint if the index is referenced by foreign key
+or index is a foreign key index
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_foreign_constraint(
+/*==============================*/
+ dict_table_t* table, /*!< in: InnoDB table */
+ dict_index_t* index); /*!< in: InnoDB index */
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+bot participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+dict_create_foreign_constraints(
+/*============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES
+ table2(c, d), table2 can be written
+ also with the database
+ name before it: test.table2; the
+ default database id the database of
+ parameter name */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks); /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
+/**********************************************************************//**
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+UNIV_INTERN
+ulint
+dict_foreign_parse_drop_constraints(
+/*================================*/
+ mem_heap_t* heap, /*!< in: heap from which we can
+ allocate memory */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table */
+ ulint* n, /*!< out: number of constraints
+ to drop */
+ const char*** constraints_to_drop); /*!< out: id's of the
+ constraints to drop */
+/**********************************************************************//**
+Returns a table object and optionally increment its MySQL open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low is usually the
+appropriate function.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_get(
+/*===========*/
+ const char* table_name, /*!< in: table name */
+ ibool inc_mysql_count);
+ /*!< in: whether to increment the open
+ handle count on the table */
+/**********************************************************************//**
+Returns a index object, based on table and index id, and memoryfixes it.
+@return index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_on_id_low(
+/*=====================*/
+ dict_table_t* table, /*!< in: table */
+ dulint index_id); /*!< in: index id */
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return table, NULL if not found */
+
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+ const char* table_name); /*!< in: table name */
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+ const char* table_name); /*!< in: table name */
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_on_id_low(
+/*=====================*/
+ dulint table_id); /*!< in: table id */
+/**********************************************************************//**
+Find an index that is equivalent to the one passed in and is not marked
+for deletion.
+@return index equivalent to foreign->foreign_index, or NULL */
+UNIV_INTERN
+dict_index_t*
+dict_foreign_find_equiv_index(
+/*==========================*/
+ dict_foreign_t* foreign);/*!< in: foreign key */
+/**********************************************************************//**
+Returns an index object by matching on the name and column names and
+if more than one index matches return the index with the max id
+@return matching index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_by_max_id(
+/*===========================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name, /*!< in: the index name to find */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols);/*!< in: number of columns */
+/**********************************************************************//**
+Returns a column's name.
+@return column name. NOTE: not guaranteed to stay valid if table is
+modified in any way (columns added, etc.). */
+UNIV_INTERN
+const char*
+dict_table_get_col_name(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint col_nr);/*!< in: column number */
+
+/**********************************************************************//**
+Prints a table definition. */
+UNIV_INTERN
+void
+dict_table_print(
+/*=============*/
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Prints a table data. */
+UNIV_INTERN
+void
+dict_table_print_low(
+/*=================*/
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Prints a table data when we know the table name. */
+UNIV_INTERN
+void
+dict_table_print_by_name(
+/*=====================*/
+ const char* name); /*!< in: table name */
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_keys(
+/*============================*/
+ ibool create_table_format, /*!< in: if TRUE then print in
+ a format suitable to be inserted into
+ a CREATE TABLE, otherwise in the format
+ of SHOW TABLE STATUS */
+ FILE* file, /*!< in: file where to print */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+ FILE* file, /*!< in: file where to print */
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ ibool add_newline); /*!< in: whether to add a newline */
+/********************************************************************//**
+Displays the names of the index and the table. */
+UNIV_INTERN
+void
+dict_index_name_print(
+/*==================*/
+ FILE* file, /*!< in: output stream */
+ trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index); /*!< in: index to print */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index); /*!< in: index */
+#else /* UNIV_DEBUG */
+# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes)
+# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Check whether the index is the clustered index.
+@return nonzero for clustered index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_clust(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((pure));
+/********************************************************************//**
+Check whether the index is unique.
+@return nonzero for unique index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_unique(
+/*=================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((pure));
+/********************************************************************//**
+Check whether the index is the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_ibuf(
+/*===============*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((pure));
+/********************************************************************//**
+Check whether the index is a secondary index or the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_sec_or_ibuf(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((pure));
+
+/********************************************************************//**
+Gets the number of user-defined columns in a table in the dictionary
+cache.
+@return number of user-defined (e.g., not ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Gets the number of system columns in a table in the dictionary cache.
+@return number of system (e.g., ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Gets the number of all columns (also system) in a table in the dictionary
+cache.
+@return number of columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table); /*!< in: table */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos); /*!< in: position of column */
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys); /*!< in: DATA_ROW_ID, ... */
+#else /* UNIV_DEBUG */
+#define dict_table_get_nth_col(table, pos) \
+((table)->cols + (pos))
+#define dict_table_get_sys_col(table, sys) \
+((table)->cols + (table)->n_cols + (sys) - DATA_N_SYS_COLS)
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys); /*!< in: DATA_ROW_ID, ... */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+ulint
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index); /*!< in: index */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Check whether the table uses the compact page format.
+@return TRUE if table uses the compact page format */
+UNIV_INLINE
+ibool
+dict_table_is_comp(
+/*===============*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Determine the file format of a table.
+@return file format version */
+UNIV_INLINE
+ulint
+dict_table_get_format(
+/*==================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Set the file format of a table. */
+UNIV_INLINE
+void
+dict_table_set_format(
+/*==================*/
+ dict_table_t* table, /*!< in/out: table */
+ ulint format);/*!< in: file format version */
+/********************************************************************//**
+Extract the compressed page size from table flags.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_flags_to_zip_size(
+/*=========================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+/********************************************************************//**
+Check whether the table uses the compressed compact page format.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_zip_size(
+/*================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+UNIV_INTERN
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n); /*!< in: column number */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create(). */
+UNIV_INTERN
+void
+dict_table_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Looks for an index with the given id. NOTE that we do not reserve
+the dictionary mutex: this function is for emergency purposes like
+printing info of a corrupt database page!
+@return index or NULL if not found from cache */
+UNIV_INTERN
+dict_index_t*
+dict_index_find_on_id_low(
+/*======================*/
+ dulint id); /*!< in: index id */
+/**********************************************************************//**
+Adds an index to the dictionary cache.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_index_add_to_cache(
+/*====================*/
+ dict_table_t* table, /*!< in: table on which the index is */
+ dict_index_t* index, /*!< in, own: index; NOTE! The index memory
+ object is freed in this function! */
+ ulint page_no,/*!< in: root page number of the index */
+ ibool strict);/*!< in: TRUE=refuse to create the index
+ if records could be too big to fit in
+ an B-tree page */
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index); /*!< in, own: index */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index); /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index); /*!< in: an internal representation
+ of index (in the dictionary cache) */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index); /*!< in: an internal representation
+ of index (in the dictionary cache) */
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation we add the row id to the ordering fields to make all indexes
+unique, but this function returns the number of fields the user defined
+in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index); /*!< in: an internal representation
+ of index (in the dictionary cache) */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos); /*!< in: position of field */
+#else /* UNIV_DEBUG */
+# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos))
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos); /*!< in: position of the field */
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos); /*!< in: position of the field */
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n); /*!< in: column number */
+/********************************************************************//**
+Returns TRUE if the index contains a column or a prefix of that column.
+@return TRUE if contains the column or its prefix */
+UNIV_INTERN
+ibool
+dict_index_contains_col_or_prefix(
+/*==============================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n); /*!< in: column number */
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix longer than the
+column in index2. That is, we must be able to construct the prefix in index2
+from the prefix in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index from which to search */
+ const dict_index_t* index2, /*!< in: index */
+ ulint n); /*!< in: field number in index2 */
+/********************************************************************//**
+Looks for column n position in the clustered index.
+@return position in internal representation of the clustered index */
+UNIV_INTERN
+ulint
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n); /*!< in: column number */
+/********************************************************************//**
+Returns the position of a system column in an index.
+@return position, ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint type); /*!< in: DATA_ROW_ID, ... */
+/*******************************************************************//**
+Adds a column to index. */
+UNIV_INTERN
+void
+dict_index_add_col(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ const dict_table_t* table, /*!< in: table */
+ dict_col_t* col, /*!< in: column */
+ ulint prefix_len); /*!< in: column prefix length */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+UNIV_INTERN
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_index_t* index, /*!< in: index */
+ ulint n_fields); /*!< in: number of
+ field types to copy */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field); /*!< in: index field */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+Assumes that dict_sys->mutex is already being held.
+@return index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+ dulint index_id); /*!< in: index id */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ dulint index_id); /*!< in: index id */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index tree */
+ const dtuple_t* tuple); /*!< in: tuple used in a search */
+/**********************************************************************//**
+Check for duplicate index entries in a table [using the index name] */
+UNIV_INTERN
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+ const dict_table_t* table); /*!< in: Check for dup indexes
+ in this table */
+
+#endif /* UNIV_DEBUG */
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap, /*!< in: memory heap where pointer
+ created */
+ ulint level); /*!< in: level of rec in tree:
+ 0 means leaf level */
+/**********************************************************************//**
+Copies an initial segment of a physical record, long enough to specify an
+index entry uniquely.
+@return pointer to the prefix record */
+UNIV_INTERN
+rec_t*
+dict_index_copy_rec_order_prefix(
+/*=============================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to
+ copy prefix */
+ ulint* n_fields,/*!< out: number of fields copied */
+ byte** buf, /*!< in/out: memory buffer for the
+ copied prefix, or NULL */
+ ulint* buf_size);/*!< in/out: buffer size */
+/**********************************************************************//**
+Builds a typed data tuple out of a physical record.
+@return own: data tuple */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_data_tuple(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record for which to build data tuple */
+ ulint n_fields,/*!< in: number of data fields */
+ mem_heap_t* heap); /*!< in: memory heap where tuple created */
+/*********************************************************************//**
+Gets the space id of the root of the index tree.
+@return space id */
+UNIV_INLINE
+ulint
+dict_index_get_space(
+/*=================*/
+ const dict_index_t* index); /*!< in: index */
+/*********************************************************************//**
+Sets the space id of the root of the index tree. */
+UNIV_INLINE
+void
+dict_index_set_space(
+/*=================*/
+ dict_index_t* index, /*!< in/out: index */
+ ulint space); /*!< in: space id */
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+ulint
+dict_index_get_page(
+/*================*/
+ const dict_index_t* tree); /*!< in: index */
+/*********************************************************************//**
+Sets the page number of the root of index tree. */
+UNIV_INLINE
+void
+dict_index_set_page(
+/*================*/
+ dict_index_t* index, /*!< in/out: index */
+ ulint page); /*!< in: page number */
+/*********************************************************************//**
+Gets the read-write lock of the index tree.
+@return read-write lock */
+UNIV_INLINE
+rw_lock_t*
+dict_index_get_lock(
+/*================*/
+ dict_index_t* index); /*!< in: index */
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void);
+/*==============================*/
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+UNIV_INTERN
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+ const dict_index_t* index); /*!< in: index */
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. */
+UNIV_INTERN
+void
+dict_update_statistics_low(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool has_dict_mutex);/*!< in: TRUE if the caller has the
+ dictionary mutex */
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. */
+UNIV_INTERN
+void
+dict_update_statistics(
+/*===================*/
+ dict_table_t* table); /*!< in/out: table */
+/********************************************************************//**
+Reserves the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_enter_for_mysql(void);
+/*============================*/
+/********************************************************************//**
+Releases the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_exit_for_mysql(void);
+/*===========================*/
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+UNIV_INTERN
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+ const char* name1, /*!< in: table name in the form
+ dbname '/' tablename */
+ const char* name2); /*!< in: table name in the form
+ dbname '/' tablename */
+/*********************************************************************//**
+Removes an index from the cache */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index); /*!< in, own: index */
+/**********************************************************************//**
+Get index by name
+@return index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name(
+/*=========================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name); /*!< in: name of the index to find */
+/**********************************************************************//**
+In case there is more than one index with the same name return the index
+with the min(id).
+@return index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name_and_min_id(
+/*====================================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name); /*!< in: name of the index to find */
+/* Buffers for storing detailed information about the latest foreign key
+and unique key errors */
+extern FILE* dict_foreign_err_file;
+extern mutex_t dict_foreign_err_mutex; /* mutex protecting the buffers */
+
+/** the dictionary system */
+extern dict_sys_t* dict_sys;
+/** the data dictionary rw-latch protecting dict_sys */
+extern rw_lock_t dict_operation_lock;
+
+/* Dictionary system struct */
+struct dict_sys_struct{
+ mutex_t mutex; /*!< mutex protecting the data
+ dictionary; protects also the
+ disk-based dictionary system tables;
+ this mutex serializes CREATE TABLE
+ and DROP TABLE, as well as reading
+ the dictionary data for a table from
+ system tables */
+ dulint row_id; /*!< the next row id to assign;
+ NOTE that at a checkpoint this
+ must be written to the dict system
+ header and flushed to a file; in
+ recovery this must be derived from
+ the log records */
+ hash_table_t* table_hash; /*!< hash table of the tables, based
+ on name */
+ hash_table_t* table_id_hash; /*!< hash table of the tables, based
+ on id */
+ UT_LIST_BASE_NODE_T(dict_table_t)
+ table_LRU; /*!< LRU list of tables */
+ ulint size; /*!< varying space in bytes occupied
+ by the data dictionary table and
+ index objects */
+ dict_table_t* sys_tables; /*!< SYS_TABLES table */
+ dict_table_t* sys_columns; /*!< SYS_COLUMNS table */
+ dict_table_t* sys_indexes; /*!< SYS_INDEXES table */
+ dict_table_t* sys_fields; /*!< SYS_FIELDS table */
+};
+#endif /* !UNIV_HOTBACKUP */
+
+/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
+extern dict_index_t* dict_ind_redundant;
+/** dummy index for ROW_FORMAT=COMPACT supremum and infimum records */
+extern dict_index_t* dict_ind_compact;
+
+/**********************************************************************//**
+Inits dict_ind_redundant and dict_ind_compact. */
+UNIV_INTERN
+void
+dict_ind_init(void);
+/*===============*/
+
+#ifndef UNIV_NONINL
+#include "dict0dict.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic
new file mode 100644
index 00000000000..46e78df8272
--- /dev/null
+++ b/storage/innobase/include/dict0dict.ic
@@ -0,0 +1,806 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0dict.ic
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "data0type.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0load.h"
+#include "rem0types.h"
+
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type) /*!< out: data type */
+{
+ ut_ad(col && type);
+
+ type->mtype = col->mtype;
+ type->prtype = col->prtype;
+ type->len = col->len;
+ type->mbminlen = col->mbminlen;
+ type->mbmaxlen = col->mbmaxlen;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(col);
+ ut_ad(type);
+
+ ut_ad(col->mtype == type->mtype);
+ ut_ad(col->prtype == type->prtype);
+ ut_ad(col->len == type->len);
+# ifndef UNIV_HOTBACKUP
+ ut_ad(col->mbminlen == type->mbminlen);
+ ut_ad(col->mbmaxlen == type->mbmaxlen);
+# endif /* !UNIV_HOTBACKUP */
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+ulint
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_min_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen));
+}
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_max_size_low(col->mtype, col->len));
+}
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen, comp));
+}
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dict_col_get_fixed_size(col, comp));
+}
+
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+ulint
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col) /*!< in: column */
+{
+ ut_ad(col);
+
+ return(col->ind);
+}
+
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index) /*!< in: clustered index */
+{
+ ulint i;
+
+ ut_ad(col);
+ ut_ad(clust_index);
+ ut_ad(dict_index_is_clust(clust_index));
+
+ for (i = 0; i < clust_index->n_def; i++) {
+ const dict_field_t* field = &clust_index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes));
+}
+
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index));
+}
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Check whether the index is the clustered index.
+@return nonzero for clustered index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_clust(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(UNIV_UNLIKELY(index->type & DICT_CLUSTERED));
+}
+/********************************************************************//**
+Check whether the index is unique.
+@return nonzero for unique index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_unique(
+/*=================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(UNIV_UNLIKELY(index->type & DICT_UNIQUE));
+}
+
+/********************************************************************//**
+Check whether the index is the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_ibuf(
+/*===============*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(UNIV_UNLIKELY(index->type & DICT_IBUF));
+}
+
+/********************************************************************//**
+Check whether the index is a secondary index or the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_sec_or_ibuf(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ulint type;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ type = index->type;
+
+ return(UNIV_LIKELY(!(type & DICT_CLUSTERED) || (type & DICT_IBUF)));
+}
+
+/********************************************************************//**
+Gets the number of user-defined columns in a table in the dictionary
+cache.
+@return number of user-defined (e.g., not ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_cols - DATA_N_SYS_COLS);
+}
+
+/********************************************************************//**
+Gets the number of system columns in a table in the dictionary cache.
+@return number of system (e.g., ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+ const dict_table_t* table __attribute__((unused))) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(table->cached);
+
+ return(DATA_N_SYS_COLS);
+}
+
+/********************************************************************//**
+Gets the number of all columns (also system) in a table in the dictionary
+cache.
+@return number of columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos) /*!< in: position of column */
+{
+ ut_ad(table);
+ ut_ad(pos < table->n_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return((dict_col_t*) (table->cols) + pos);
+}
+
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys) /*!< in: DATA_ROW_ID, ... */
+{
+ dict_col_t* col;
+
+ ut_ad(table);
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ col = dict_table_get_nth_col(table, table->n_cols
+ - DATA_N_SYS_COLS + sys);
+ ut_ad(col->mtype == DATA_SYS);
+ ut_ad(col->prtype == (sys | DATA_NOT_NULL));
+
+ return(col);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys) /*!< in: DATA_ROW_ID, ... */
+{
+ ut_ad(table);
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_cols - DATA_N_SYS_COLS + sys);
+}
+
+/********************************************************************//**
+Check whether the table uses the compact page format.
+@return TRUE if table uses the compact page format */
+UNIV_INLINE
+ibool
+dict_table_is_comp(
+/*===============*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+
+#if DICT_TF_COMPACT != TRUE
+#error
+#endif
+
+ return(UNIV_LIKELY(table->flags & DICT_TF_COMPACT));
+}
+
+/********************************************************************//**
+Determine the file format of a table.
+@return file format version */
+UNIV_INLINE
+ulint
+dict_table_get_format(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+
+ return((table->flags & DICT_TF_FORMAT_MASK) >> DICT_TF_FORMAT_SHIFT);
+}
+
+/********************************************************************//**
+Determine the file format of a table. */
+UNIV_INLINE
+void
+dict_table_set_format(
+/*==================*/
+ dict_table_t* table, /*!< in/out: table */
+ ulint format) /*!< in: file format version */
+{
+ ut_ad(table);
+
+ table->flags = (table->flags & ~DICT_TF_FORMAT_MASK)
+ | (format << DICT_TF_FORMAT_SHIFT);
+}
+
+/********************************************************************//**
+Extract the compressed page size from table flags.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_flags_to_zip_size(
+/*=========================*/
+ ulint flags) /*!< in: flags */
+{
+ ulint zip_size = flags & DICT_TF_ZSSIZE_MASK;
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ zip_size = ((PAGE_ZIP_MIN_SIZE >> 1)
+ << (zip_size >> DICT_TF_ZSSIZE_SHIFT));
+
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+ }
+
+ return(zip_size);
+}
+
+/********************************************************************//**
+Check whether the table uses the compressed compact page format.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_zip_size(
+/*================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+
+ return(dict_table_flags_to_zip_size(table->flags));
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->n_fields);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ return(index->n_uniq);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_index_get_n_unique(index));
+ }
+
+ return(dict_index_get_n_fields(index));
+}
+
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation of clustered indexes we add the row id to the ordering fields
+to make a clustered index unique, but this function returns the number of
+fields the user defined in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ return(index->n_user_defined_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of field */
+{
+ ut_ad(index);
+ ut_ad(pos < index->n_def);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return((dict_field_t*) (index->fields) + pos);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Returns the position of a system column in an index.
+@return position, ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint type) /*!< in: DATA_ROW_ID, ... */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(!(index->type & DICT_UNIVERSAL));
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_col_get_clust_pos(
+ dict_table_get_sys_col(index->table, type),
+ index));
+ }
+
+ return(dict_index_get_nth_col_pos(
+ index, dict_table_get_sys_col_no(index->table, type)));
+}
+
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+{
+ ut_ad(field);
+
+ return(field->col);
+}
+
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_field_get_col(dict_index_get_nth_field(index, pos)));
+}
+
+/********************************************************************//**
+Gets the column number the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+ulint
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ulint n = dict_index_get_n_fields(index);
+ ulint size = 0;
+
+ while (n--) {
+ size += dict_col_get_min_size(dict_index_get_nth_col(index,
+ n));
+ }
+
+ return(size);
+}
+
+/*********************************************************************//**
+Gets the space id of the root of the index tree.
+@return space id */
+UNIV_INLINE
+ulint
+dict_index_get_space(
+/*=================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->space);
+}
+
+/*********************************************************************//**
+Sets the space id of the root of the index tree. */
+UNIV_INLINE
+void
+dict_index_set_space(
+/*=================*/
+ dict_index_t* index, /*!< in/out: index */
+ ulint space) /*!< in: space id */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ index->space = space;
+}
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+ulint
+dict_index_get_page(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->page);
+}
+
+/*********************************************************************//**
+Sets the page number of the root of index tree. */
+UNIV_INLINE
+void
+dict_index_set_page(
+/*================*/
+ dict_index_t* index, /*!< in/out: index */
+ ulint page) /*!< in: page number */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ index->page = page;
+}
+
+/*********************************************************************//**
+Gets the read-write lock of the index tree.
+@return read-write lock */
+UNIV_INLINE
+rw_lock_t*
+dict_index_get_lock(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(&(index->lock));
+}
+
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void)
+/*==============================*/
+{
+ return(UNIV_PAGE_SIZE / 16);
+}
+
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+ const char* table_name) /*!< in: table name */
+{
+ dict_table_t* table;
+ ulint table_fold;
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* Look for the table name in the hash table */
+ table_fold = ut_fold_string(table_name);
+
+ HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold,
+ dict_table_t*, table, ut_ad(table->cached),
+ !strcmp(table->name, table_name));
+ return(table);
+}
+
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+ const char* table_name) /*!< in: table name */
+{
+ dict_table_t* table;
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = dict_table_check_if_in_cache_low(table_name);
+
+ if (table == NULL) {
+ table = dict_load_table(table_name);
+ }
+
+ ut_ad(!table || table->cached);
+
+ return(table);
+}
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_on_id_low(
+/*=====================*/
+ dulint table_id) /*!< in: table id */
+{
+ dict_table_t* table;
+ ulint fold;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* Look for the table name in the hash table */
+ fold = ut_fold_dulint(table_id);
+
+ HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
+ dict_table_t*, table, ut_ad(table->cached),
+ !ut_dulint_cmp(table->id, table_id));
+ if (table == NULL) {
+ table = dict_load_table_on_id(table_id);
+ }
+
+ ut_ad(!table || table->cached);
+
+ /* TODO: should get the type information from MySQL */
+
+ return(table);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
new file mode 100644
index 00000000000..60b8c1fb632
--- /dev/null
+++ b/storage/innobase/include/dict0load.h
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.h
+Loads to the memory cache database object definitions
+from dictionary tables
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0load_h
+#define dict0load_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+
+/********************************************************************//**
+In a crash recovery we already have all the tablespace objects created.
+This function compares the space id information in the InnoDB data dictionary
+to what we already read with fil_load_single_table_tablespaces().
+
+In a normal startup, we create the tablespace objects for every table in
+InnoDB's data dictionary, if the corresponding .ibd file exists.
+We also scan the biggest space id, and store it to fil_system. */
+UNIV_INTERN
+void
+dict_check_tablespaces_and_store_max_id(
+/*====================================*/
+ ibool in_crash_recovery); /*!< in: are we doing a crash recovery */
+/********************************************************************//**
+Finds the first table name in the given database.
+@return own: table name, NULL if does not exist; the caller must free
+the memory in the string! */
+UNIV_INTERN
+char*
+dict_get_first_table_name_in_db(
+/*============================*/
+ const char* name); /*!< in: database name which ends to '/' */
+/********************************************************************//**
+Loads a table definition and also all its index definitions, and also
+the cluster definition if the table is a member in a cluster. Also loads
+all foreign key constraints where the foreign key is in the table or where
+a foreign key references columns in this table.
+@return table, NULL if does not exist; if the table is stored in an
+.ibd file, but the file does not exist, then we set the
+ibd_file_missing flag TRUE in the table object we return */
+UNIV_INTERN
+dict_table_t*
+dict_load_table(
+/*============*/
+ const char* name); /*!< in: table name in the
+ databasename/tablename format */
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+ dulint table_id); /*!< in: table id */
+/********************************************************************//**
+This function is called when the database is booted.
+Loads system table index definitions except for the clustered index which
+is added to the dictionary cache at booting before calling this function. */
+UNIV_INTERN
+void
+dict_load_sys_table(
+/*================*/
+ dict_table_t* table); /*!< in: system table */
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or where the table is referenced by a foreign key. Adds these
+constraints to the data dictionary. Note that we know that the dictionary
+cache already contains all constraints where the other relevant table is
+already in the dictionary cache.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_load_foreigns(
+/*===============*/
+ const char* table_name, /*!< in: table name */
+ ibool check_charsets);/*!< in: TRUE=check charsets
+ compatibility */
+/********************************************************************//**
+Prints to the standard output information on all tables found in the data
+dictionary system table. */
+UNIV_INTERN
+void
+dict_print(void);
+/*============*/
+
+
+#ifndef UNIV_NONINL
+#include "dict0load.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0load.ic b/storage/innobase/include/dict0load.ic
new file mode 100644
index 00000000000..ccc16db165b
--- /dev/null
+++ b/storage/innobase/include/dict0load.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.ic
+Loads to the memory cache database object definitions
+from dictionary tables
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
new file mode 100644
index 00000000000..1ee906fbf57
--- /dev/null
+++ b/storage/innobase/include/dict0mem.h
@@ -0,0 +1,537 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0mem.h
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0mem_h
+#define dict0mem_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "btr0types.h"
+#ifndef UNIV_HOTBACKUP
+# include "lock0types.h"
+# include "que0types.h"
+# include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+
+/** Type flags of an index: OR'ing of the flags is allowed to define a
+combination of types */
+/* @{ */
+#define DICT_CLUSTERED 1 /*!< clustered index */
+#define DICT_UNIQUE 2 /*!< unique index */
+#define DICT_UNIVERSAL 4 /*!< index which can contain records from any
+ other index */
+#define DICT_IBUF 8 /*!< insert buffer tree */
+/* @} */
+
+/** Types for a table object */
+#define DICT_TABLE_ORDINARY 1 /*!< ordinary table */
+#if 0 /* not implemented */
+#define DICT_TABLE_CLUSTER_MEMBER 2
+#define DICT_TABLE_CLUSTER 3 /* this means that the table is
+ really a cluster definition */
+#endif
+
+/** Table flags. All unused bits must be 0. */
+/* @{ */
+#define DICT_TF_COMPACT 1 /* Compact page format.
+ This must be set for
+ new file formats
+ (later than
+ DICT_TF_FORMAT_51). */
+
+/** Compressed page size (0=uncompressed, up to 15 compressed sizes) */
+/* @{ */
+#define DICT_TF_ZSSIZE_SHIFT 1
+#define DICT_TF_ZSSIZE_MASK (15 << DICT_TF_ZSSIZE_SHIFT)
+#define DICT_TF_ZSSIZE_MAX (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 1)
+/* @} */
+
+/** File format */
+/* @{ */
+#define DICT_TF_FORMAT_SHIFT 5 /* file format */
+#define DICT_TF_FORMAT_MASK (127 << DICT_TF_FORMAT_SHIFT)
+#define DICT_TF_FORMAT_51 0 /*!< InnoDB/MySQL up to 5.1 */
+#define DICT_TF_FORMAT_ZIP 1 /*!< InnoDB plugin for 5.1:
+ compressed tables,
+ new BLOB treatment */
+/** Maximum supported file format */
+#define DICT_TF_FORMAT_MAX DICT_TF_FORMAT_ZIP
+
+#define DICT_TF_BITS 6 /*!< number of flag bits */
+#if (1 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT)) <= DICT_TF_FORMAT_MAX
+# error "DICT_TF_BITS is insufficient for DICT_TF_FORMAT_MAX"
+#endif
+/* @} */
+/* @} */
+
+/**********************************************************************//**
+Creates a table memory object.
+@return own: table object */
+UNIV_INTERN
+dict_table_t*
+dict_mem_table_create(
+/*==================*/
+ const char* name, /*!< in: table name */
+ ulint space, /*!< in: space where the clustered index
+ of the table is placed; this parameter
+ is ignored if the table is made
+ a member of a cluster */
+ ulint n_cols, /*!< in: number of columns */
+ ulint flags); /*!< in: table flags */
+/****************************************************************//**
+Free a table memory object. */
+UNIV_INTERN
+void
+dict_mem_table_free(
+/*================*/
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Adds a column definition to a table. */
+UNIV_INTERN
+void
+dict_mem_table_add_col(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
+ const char* name, /*!< in: column name, or NULL */
+ ulint mtype, /*!< in: main datatype */
+ ulint prtype, /*!< in: precise type */
+ ulint len); /*!< in: precision */
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+UNIV_INTERN
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+ const char* table_name, /*!< in: table name */
+ const char* index_name, /*!< in: index name */
+ ulint space, /*!< in: space where the index tree is
+ placed, ignored if the index is of
+ the clustered type */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields); /*!< in: number of fields */
+/**********************************************************************//**
+Adds a field definition to an index. NOTE: does not take a copy
+of the column name if the field is a column. The memory occupied
+by the column name may be released only after publishing the index. */
+UNIV_INTERN
+void
+dict_mem_index_add_field(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ const char* name, /*!< in: column name */
+ ulint prefix_len); /*!< in: 0 or the column prefix length
+ in a MySQL index like
+ INDEX (textcol(25)) */
+/**********************************************************************//**
+Frees an index memory object. */
+UNIV_INTERN
+void
+dict_mem_index_free(
+/*================*/
+ dict_index_t* index); /*!< in: index */
+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.
+@return own: foreign constraint struct */
+UNIV_INTERN
+dict_foreign_t*
+dict_mem_foreign_create(void);
+/*=========================*/
+
+/** Data structure for a column in a table */
+struct dict_col_struct{
+ /*----------------------*/
+ /** The following are copied from dtype_t,
+ so that all bit-fields can be packed tightly. */
+ /* @{ */
+ unsigned mtype:8; /*!< main data type */
+ unsigned prtype:24; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+
+ unsigned mbminlen:2; /*!< minimum length of a
+ character, in bytes */
+ unsigned mbmaxlen:3; /*!< maximum length of a
+ character, in bytes */
+ /*----------------------*/
+ /* End of definitions copied from dtype_t */
+ /* @} */
+
+ unsigned ind:10; /*!< table column position
+ (starting from 0) */
+ unsigned ord_part:1; /*!< nonzero if this column
+ appears in the ordering fields
+ of an index */
+};
+
+/** @brief DICT_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
+indexed column length (or indexed prefix length).
+
+It is set to 3*256, so that one can create a column prefix index on
+256 characters of a TEXT or VARCHAR column also in the UTF-8
+charset. In that charset, a character may take at most 3 bytes. This
+constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define DICT_MAX_INDEX_COL_LEN REC_MAX_INDEX_COL_LEN
+
+/** Data structure for a field in an index */
+struct dict_field_struct{
+ dict_col_t* col; /*!< pointer to the table column */
+ const char* name; /*!< name of the column */
+ unsigned prefix_len:10; /*!< 0 or the length of the column
+ prefix in bytes in a MySQL index of
+ type, e.g., INDEX (textcol(25));
+ must be smaller than
+ DICT_MAX_INDEX_COL_LEN; NOTE that
+ in the UTF-8 charset, MySQL sets this
+ to 3 * the prefix len in UTF-8 chars */
+ unsigned fixed_len:10; /*!< 0 or the fixed length of the
+ column if smaller than
+ DICT_MAX_INDEX_COL_LEN */
+};
+
+/** Data structure for an index. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_index_create(). */
+struct dict_index_struct{
+ dulint id; /*!< id of the index */
+ mem_heap_t* heap; /*!< memory heap */
+ const char* name; /*!< index name */
+ const char* table_name;/*!< table name */
+ dict_table_t* table; /*!< back pointer to table */
+#ifndef UNIV_HOTBACKUP
+ unsigned space:32;
+ /*!< space where the index tree is placed */
+ unsigned page:32;/*!< index tree root page number */
+#endif /* !UNIV_HOTBACKUP */
+ unsigned type:4; /*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
+ DICT_UNIVERSAL, DICT_IBUF) */
+ unsigned trx_id_offset:10;/*!< position of the trx id column
+ in a clustered index record, if the fields
+ before it are known to be of a fixed size,
+ 0 otherwise */
+ unsigned n_user_defined_cols:10;
+ /*!< number of columns the user defined to
+ be in the index: in the internal
+ representation we add more columns */
+ unsigned n_uniq:10;/*!< number of fields from the beginning
+ which are enough to determine an index
+ entry uniquely */
+ unsigned n_def:10;/*!< number of fields defined so far */
+ unsigned n_fields:10;/*!< number of fields in the index */
+ unsigned n_nullable:10;/*!< number of nullable fields */
+ unsigned cached:1;/*!< TRUE if the index object is in the
+ dictionary cache */
+ unsigned to_be_dropped:1;
+ /*!< TRUE if this index is marked to be
+ dropped in ha_innobase::prepare_drop_index(),
+ otherwise FALSE */
+ dict_field_t* fields; /*!< array of field descriptions */
+#ifndef UNIV_HOTBACKUP
+ UT_LIST_NODE_T(dict_index_t)
+ indexes;/*!< list of indexes of the table */
+ btr_search_t* search_info; /*!< info used in optimistic searches */
+ /*----------------------*/
+ /** Statistics for query optimization */
+ /* @{ */
+ ib_int64_t* stat_n_diff_key_vals;
+ /*!< approximate number of different
+ key values for this index, for each
+ n-column prefix where n <=
+ dict_get_n_unique(index); we
+ periodically calculate new
+ estimates */
+ ulint stat_index_size;
+ /*!< approximate index size in
+ database pages */
+ ulint stat_n_leaf_pages;
+ /*!< approximate number of leaf pages in the
+ index tree */
+ /* @} */
+ rw_lock_t lock; /*!< read-write lock protecting the
+ upper levels of the index tree */
+ ib_uint64_t trx_id; /*!< id of the transaction that created this
+ index, or 0 if the index existed
+ when InnoDB was started up */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+ ulint magic_n;/*!< magic number */
+/** Value of dict_index_struct::magic_n */
+# define DICT_INDEX_MAGIC_N 76789786
+#endif
+};
+
+/** Data structure for a foreign key constraint; an example:
+FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
+struct dict_foreign_struct{
+ mem_heap_t* heap; /*!< this object is allocated from
+ this memory heap */
+ char* id; /*!< id of the constraint as a
+ null-terminated string */
+ unsigned n_fields:10; /*!< number of indexes' first fields
+ for which the the foreign key
+ constraint is defined: we allow the
+ indexes to contain more fields than
+ mentioned in the constraint, as long
+ as the first fields are as mentioned */
+ unsigned type:6; /*!< 0 or DICT_FOREIGN_ON_DELETE_CASCADE
+ or DICT_FOREIGN_ON_DELETE_SET_NULL */
+ char* foreign_table_name;/*!< foreign table name */
+ dict_table_t* foreign_table; /*!< table where the foreign key is */
+ const char** foreign_col_names;/*!< names of the columns in the
+ foreign key */
+ char* referenced_table_name;/*!< referenced table name */
+ dict_table_t* referenced_table;/*!< table where the referenced key
+ is */
+ const char** referenced_col_names;/*!< names of the referenced
+ columns in the referenced table */
+ dict_index_t* foreign_index; /*!< foreign index; we require that
+ both tables contain explicitly defined
+ indexes for the constraint: InnoDB
+ does not generate new indexes
+ implicitly */
+ dict_index_t* referenced_index;/*!< referenced index */
+ UT_LIST_NODE_T(dict_foreign_t)
+ foreign_list; /*!< list node for foreign keys of the
+ table */
+ UT_LIST_NODE_T(dict_foreign_t)
+ referenced_list;/*!< list node for referenced
+ keys of the table */
+};
+
+/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that
+a foreign key constraint is enforced, therefore RESTRICT just means no flag */
+/* @{ */
+#define DICT_FOREIGN_ON_DELETE_CASCADE 1 /*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_DELETE_SET_NULL 2 /*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_UPDATE_CASCADE 4 /*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_UPDATE_SET_NULL 8 /*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16 /*!< ON DELETE NO ACTION */
+#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32 /*!< ON UPDATE NO ACTION */
+/* @} */
+
+
+/** Data structure for a database table. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_table_create(). */
+struct dict_table_struct{
+ dulint id; /*!< id of the table */
+ mem_heap_t* heap; /*!< memory heap */
+ const char* name; /*!< table name */
+ const char* dir_path_of_temp_table;/*!< NULL or the directory path
+ where a TEMPORARY table that was explicitly
+ created by a user should be placed if
+ innodb_file_per_table is defined in my.cnf;
+ in Unix this is usually /tmp/..., in Windows
+ temp\... */
+ unsigned space:32;
+ /*!< space where the clustered index of the
+ table is placed */
+ unsigned flags:DICT_TF_BITS;/*!< DICT_TF_COMPACT, ... */
+ unsigned ibd_file_missing:1;
+ /*!< TRUE if this is in a single-table
+ tablespace and the .ibd file is missing; then
+ we must return in ha_innodb.cc an error if the
+ user tries to query such an orphaned table */
+ unsigned tablespace_discarded:1;
+ /*!< this flag is set TRUE when the user
+ calls DISCARD TABLESPACE on this
+ table, and reset to FALSE in IMPORT
+ TABLESPACE */
+ unsigned cached:1;/*!< TRUE if the table object has been added
+ to the dictionary cache */
+ unsigned n_def:10;/*!< number of columns defined so far */
+ unsigned n_cols:10;/*!< number of columns */
+ dict_col_t* cols; /*!< array of column descriptions */
+ const char* col_names;
+ /*!< Column names packed in a character string
+ "name1\0name2\0...nameN\0". Until
+ the string contains n_cols, it will be
+ allocated from a temporary heap. The final
+ string will be allocated from table->heap. */
+#ifndef UNIV_HOTBACKUP
+ hash_node_t name_hash; /*!< hash chain node */
+ hash_node_t id_hash; /*!< hash chain node */
+ UT_LIST_BASE_NODE_T(dict_index_t)
+ indexes; /*!< list of indexes of the table */
+ UT_LIST_BASE_NODE_T(dict_foreign_t)
+ foreign_list;/*!< list of foreign key constraints
+ in the table; these refer to columns
+ in other tables */
+ UT_LIST_BASE_NODE_T(dict_foreign_t)
+ referenced_list;/*!< list of foreign key constraints
+ which refer to this table */
+ UT_LIST_NODE_T(dict_table_t)
+ table_LRU; /*!< node of the LRU list of tables */
+ ulint n_mysql_handles_opened;
+ /*!< count of how many handles MySQL has opened
+ to this table; dropping of the table is
+ NOT allowed until this count gets to zero;
+ MySQL does NOT itself check the number of
+ open handles at drop */
+ ulint n_foreign_key_checks_running;
+ /*!< count of how many foreign key check
+ operations are currently being performed
+ on the table: we cannot drop the table while
+ there are foreign key checks running on
+ it! */
+ trx_id_t query_cache_inv_trx_id;
+ /*!< transactions whose trx id is
+ smaller than this number are not
+ allowed to store to the MySQL query
+ cache or retrieve from it; when a trx
+ with undo logs commits, it sets this
+ to the value of the trx id counter for
+ the tables it had an IX lock on */
+ UT_LIST_BASE_NODE_T(lock_t)
+ locks; /*!< list of locks on the table */
+#ifdef UNIV_DEBUG
+ /*----------------------*/
+ ibool does_not_fit_in_memory;
+ /*!< this field is used to specify in
+ simulations tables which are so big
+ that disk should be accessed: disk
+ access is simulated by putting the
+ thread to sleep for a while; NOTE that
+ this flag is not stored to the data
+ dictionary on disk, and the database
+ will forget about value TRUE if it has
+ to reload the table definition from
+ disk */
+#endif /* UNIV_DEBUG */
+ /*----------------------*/
+ unsigned big_rows:1;
+ /*!< flag: TRUE if the maximum length of
+ a single row exceeds BIG_ROW_SIZE;
+ initialized in dict_table_add_to_cache() */
+ /** Statistics for query optimization */
+ /* @{ */
+ unsigned stat_initialized:1; /*!< TRUE if statistics have
+ been calculated the first time
+ after database startup or table creation */
+ ib_int64_t stat_n_rows;
+ /*!< approximate number of rows in the table;
+ we periodically calculate new estimates */
+ ulint stat_clustered_index_size;
+ /*!< approximate clustered index size in
+ database pages */
+ ulint stat_sum_of_other_index_sizes;
+ /*!< other indexes in database pages */
+ ulint stat_modified_counter;
+ /*!< when a row is inserted, updated,
+ or deleted,
+ we add 1 to this number; we calculate new
+ estimates for the stat_... values for the
+ table and the indexes at an interval of 2 GB
+ or when about 1 / 16 of table has been
+ modified; also when the estimate operation is
+ called for MySQL SHOW TABLE STATUS; the
+ counter is reset to zero at statistics
+ calculation; this counter is not protected by
+ any latch, because this is only used for
+ heuristics */
+ /* @} */
+ /*----------------------*/
+ /**!< The following fields are used by the
+ AUTOINC code. The actual collection of
+ tables locked during AUTOINC read/write is
+ kept in trx_t. In order to quickly determine
+ whether a transaction has locked the AUTOINC
+ lock we keep a pointer to the transaction
+ here in the autoinc_trx variable. This is to
+ avoid acquiring the kernel mutex and scanning
+ the vector in trx_t.
+
+ When an AUTOINC lock has to wait, the
+ corresponding lock instance is created on
+ the trx lock heap rather than use the
+ pre-allocated instance in autoinc_lock below.*/
+ /* @{ */
+ lock_t* autoinc_lock;
+ /*!< a buffer for an AUTOINC lock
+ for this table: we allocate the memory here
+ so that individual transactions can get it
+ and release it without a need to allocate
+ space from the lock heap of the trx:
+ otherwise the lock heap would grow rapidly
+ if we do a large insert from a select */
+ mutex_t autoinc_mutex;
+ /*!< mutex protecting the autoincrement
+ counter */
+ ib_uint64_t autoinc;/*!< autoinc counter value to give to the
+ next inserted row */
+ ulong n_waiting_or_granted_auto_inc_locks;
+ /*!< This counter is used to track the number
+ of granted and pending autoinc locks on this
+ table. This value is set after acquiring the
+ kernel mutex but we peek the contents to
+ determine whether other transactions have
+ acquired the AUTOINC lock or not. Of course
+ only one transaction can be granted the
+ lock but there can be multiple waiters. */
+ const trx_t* autoinc_trx;
+ /*!< The transaction that currently holds the
+ the AUTOINC lock on this table. */
+ /* @} */
+ /*----------------------*/
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+ ulint magic_n;/*!< magic number */
+/** Value of dict_table_struct::magic_n */
+# define DICT_TABLE_MAGIC_N 76333786
+#endif /* UNIV_DEBUG */
+};
+
+#ifndef UNIV_NONINL
+#include "dict0mem.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0mem.ic b/storage/innobase/include/dict0mem.ic
new file mode 100644
index 00000000000..c36adb07a18
--- /dev/null
+++ b/storage/innobase/include/dict0mem.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0mem.ic
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
new file mode 100644
index 00000000000..7ad69193cc9
--- /dev/null
+++ b/storage/innobase/include/dict0types.h
@@ -0,0 +1,48 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0types.h
+Data dictionary global types
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0types_h
+#define dict0types_h
+
+typedef struct dict_sys_struct dict_sys_t;
+typedef struct dict_col_struct dict_col_t;
+typedef struct dict_field_struct dict_field_t;
+typedef struct dict_index_struct dict_index_t;
+typedef struct dict_table_struct dict_table_t;
+typedef struct dict_foreign_struct dict_foreign_t;
+
+/* A cluster object is a table object with the type field set to
+DICT_CLUSTERED */
+
+typedef dict_table_t dict_cluster_t;
+
+typedef struct ind_node_struct ind_node_t;
+typedef struct tab_node_struct tab_node_t;
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+#endif
diff --git a/storage/innobase/include/dyn0dyn.h b/storage/innobase/include/dyn0dyn.h
new file mode 100644
index 00000000000..121a5946ac7
--- /dev/null
+++ b/storage/innobase/include/dyn0dyn.h
@@ -0,0 +1,188 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0dyn.h
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dyn0dyn_h
+#define dyn0dyn_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "mem0mem.h"
+
+/** A block in a dynamically allocated array */
+typedef struct dyn_block_struct dyn_block_t;
+/** Dynamically allocated array */
+typedef dyn_block_t dyn_array_t;
+
+
+/** This is the initial 'payload' size of a dynamic array;
+this must be > MLOG_BUF_MARGIN + 30! */
+#define DYN_ARRAY_DATA_SIZE 512
+
+/*********************************************************************//**
+Initializes a dynamic array.
+@return initialized dyn array */
+UNIV_INLINE
+dyn_array_t*
+dyn_array_create(
+/*=============*/
+ dyn_array_t* arr); /*!< in: pointer to a memory buffer of
+ size sizeof(dyn_array_t) */
+/************************************************************//**
+Frees a dynamic array. */
+UNIV_INLINE
+void
+dyn_array_free(
+/*===========*/
+ dyn_array_t* arr); /*!< in: dyn array */
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to a buffer in it.
+After copying the elements, the caller must close the buffer using
+dyn_array_close.
+@return pointer to the buffer */
+UNIV_INLINE
+byte*
+dyn_array_open(
+/*===========*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ ulint size); /*!< in: size in bytes of the buffer; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+/*********************************************************************//**
+Closes the buffer returned by dyn_array_open. */
+UNIV_INLINE
+void
+dyn_array_close(
+/*============*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ byte* ptr); /*!< in: buffer space from ptr up was not used */
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to
+the added element. The caller must copy the element to
+the pointer returned.
+@return pointer to the element */
+UNIV_INLINE
+void*
+dyn_array_push(
+/*===========*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ ulint size); /*!< in: size in bytes of the element */
+/************************************************************//**
+Returns pointer to an element in dyn array.
+@return pointer to element */
+UNIV_INLINE
+void*
+dyn_array_get_element(
+/*==================*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ ulint pos); /*!< in: position of element as bytes
+ from array start */
+/************************************************************//**
+Returns the size of stored data in a dyn array.
+@return data size in bytes */
+UNIV_INLINE
+ulint
+dyn_array_get_data_size(
+/*====================*/
+ dyn_array_t* arr); /*!< in: dyn array */
+/************************************************************//**
+Gets the first block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_first_block(
+/*======================*/
+ dyn_array_t* arr); /*!< in: dyn array */
+/************************************************************//**
+Gets the last block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_last_block(
+/*=====================*/
+ dyn_array_t* arr); /*!< in: dyn array */
+/********************************************************************//**
+Gets the next block in a dyn array.
+@return pointer to next, NULL if end of list */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_next_block(
+/*=====================*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ dyn_block_t* block); /*!< in: dyn array block */
+/********************************************************************//**
+Gets the number of used bytes in a dyn array block.
+@return number of bytes used */
+UNIV_INLINE
+ulint
+dyn_block_get_used(
+/*===============*/
+ dyn_block_t* block); /*!< in: dyn array block */
+/********************************************************************//**
+Gets pointer to the start of data in a dyn array block.
+@return pointer to data */
+UNIV_INLINE
+byte*
+dyn_block_get_data(
+/*===============*/
+ dyn_block_t* block); /*!< in: dyn array block */
+/********************************************************//**
+Pushes n bytes to a dyn array. */
+UNIV_INLINE
+void
+dyn_push_string(
+/*============*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ const byte* str, /*!< in: string to write */
+ ulint len); /*!< in: string length */
+
+/*#################################################################*/
+
+/** @brief A block in a dynamically allocated array.
+NOTE! Do not access the fields of the struct directly: the definition
+appears here only for the compiler to know its size! */
+struct dyn_block_struct{
+ mem_heap_t* heap; /*!< in the first block this is != NULL
+ if dynamic allocation has been needed */
+ ulint used; /*!< number of data bytes used in this block;
+ DYN_BLOCK_FULL_FLAG is set when the block
+ becomes full */
+ byte data[DYN_ARRAY_DATA_SIZE];
+ /*!< storage for array elements */
+ UT_LIST_BASE_NODE_T(dyn_block_t) base;
+ /*!< linear list of dyn blocks: this node is
+ used only in the first block */
+ UT_LIST_NODE_T(dyn_block_t) list;
+ /*!< linear list node: used in all blocks */
+#ifdef UNIV_DEBUG
+ ulint buf_end;/*!< only in the debug version: if dyn
+ array is opened, this is the buffer
+ end offset, else this is 0 */
+ ulint magic_n;/*!< magic number (DYN_BLOCK_MAGIC_N) */
+#endif
+};
+
+
+#ifndef UNIV_NONINL
+#include "dyn0dyn.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dyn0dyn.ic b/storage/innobase/include/dyn0dyn.ic
new file mode 100644
index 00000000000..110e674abff
--- /dev/null
+++ b/storage/innobase/include/dyn0dyn.ic
@@ -0,0 +1,365 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0dyn.ic
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+/** Value of dyn_block_struct::magic_n */
+#define DYN_BLOCK_MAGIC_N 375767
+/** Flag for dyn_block_struct::used that indicates a full block */
+#define DYN_BLOCK_FULL_FLAG 0x1000000UL
+
+/************************************************************//**
+Adds a new block to a dyn array.
+@return created block */
+UNIV_INTERN
+dyn_block_t*
+dyn_array_add_block(
+/*================*/
+ dyn_array_t* arr); /*!< in: dyn array */
+
+
+/************************************************************//**
+Gets the first block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_first_block(
+/*======================*/
+ dyn_array_t* arr) /*!< in: dyn array */
+{
+ return(arr);
+}
+
+/************************************************************//**
+Gets the last block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_last_block(
+/*=====================*/
+ dyn_array_t* arr) /*!< in: dyn array */
+{
+ if (arr->heap == NULL) {
+
+ return(arr);
+ }
+
+ return(UT_LIST_GET_LAST(arr->base));
+}
+
+/********************************************************************//**
+Gets the next block in a dyn array.
+@return pointer to next, NULL if end of list */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_next_block(
+/*=====================*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ dyn_block_t* block) /*!< in: dyn array block */
+{
+ ut_ad(arr && block);
+
+ if (arr->heap == NULL) {
+ ut_ad(arr == block);
+
+ return(NULL);
+ }
+
+ return(UT_LIST_GET_NEXT(list, block));
+}
+
+/********************************************************************//**
+Gets the number of used bytes in a dyn array block.
+@return number of bytes used */
+UNIV_INLINE
+ulint
+dyn_block_get_used(
+/*===============*/
+ dyn_block_t* block) /*!< in: dyn array block */
+{
+ ut_ad(block);
+
+ return((block->used) & ~DYN_BLOCK_FULL_FLAG);
+}
+
+/********************************************************************//**
+Gets pointer to the start of data in a dyn array block.
+@return pointer to data */
+UNIV_INLINE
+byte*
+dyn_block_get_data(
+/*===============*/
+ dyn_block_t* block) /*!< in: dyn array block */
+{
+ ut_ad(block);
+
+ return(block->data);
+}
+
+/*********************************************************************//**
+Initializes a dynamic array.
+@return initialized dyn array */
+UNIV_INLINE
+dyn_array_t*
+dyn_array_create(
+/*=============*/
+ dyn_array_t* arr) /*!< in: pointer to a memory buffer of
+ size sizeof(dyn_array_t) */
+{
+ ut_ad(arr);
+#if DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG
+# error "DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG"
+#endif
+
+ arr->heap = NULL;
+ arr->used = 0;
+
+#ifdef UNIV_DEBUG
+ arr->buf_end = 0;
+ arr->magic_n = DYN_BLOCK_MAGIC_N;
+#endif
+ return(arr);
+}
+
+/************************************************************//**
+Frees a dynamic array. */
+UNIV_INLINE
+void
+dyn_array_free(
+/*===========*/
+ dyn_array_t* arr) /*!< in: dyn array */
+{
+ if (arr->heap != NULL) {
+ mem_heap_free(arr->heap);
+ }
+
+#ifdef UNIV_DEBUG
+ arr->magic_n = 0;
+#endif
+}
+
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to the added element.
+The caller must copy the element to the pointer returned.
+@return pointer to the element */
+UNIV_INLINE
+void*
+dyn_array_push(
+/*===========*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ ulint size) /*!< in: size in bytes of the element */
+{
+ dyn_block_t* block;
+ ulint used;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+ ut_ad(size <= DYN_ARRAY_DATA_SIZE);
+ ut_ad(size);
+
+ block = arr;
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ /* Get the last array block */
+
+ block = dyn_array_get_last_block(arr);
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ block = dyn_array_add_block(arr);
+ used = block->used;
+ }
+ }
+
+ block->used = used + size;
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+
+ return((block->data) + used);
+}
+
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to a buffer in it.
+After copying the elements, the caller must close the buffer using
+dyn_array_close.
+@return pointer to the buffer */
+UNIV_INLINE
+byte*
+dyn_array_open(
+/*===========*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ ulint size) /*!< in: size in bytes of the buffer; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+{
+ dyn_block_t* block;
+ ulint used;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+ ut_ad(size <= DYN_ARRAY_DATA_SIZE);
+ ut_ad(size);
+
+ block = arr;
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ /* Get the last array block */
+
+ block = dyn_array_get_last_block(arr);
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ block = dyn_array_add_block(arr);
+ used = block->used;
+ ut_a(size <= DYN_ARRAY_DATA_SIZE);
+ }
+ }
+
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+#ifdef UNIV_DEBUG
+ ut_ad(arr->buf_end == 0);
+
+ arr->buf_end = used + size;
+#endif
+ return((block->data) + used);
+}
+
+/*********************************************************************//**
+Closes the buffer returned by dyn_array_open. */
+UNIV_INLINE
+void
+dyn_array_close(
+/*============*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ byte* ptr) /*!< in: buffer space from ptr up was not used */
+{
+ dyn_block_t* block;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ block = dyn_array_get_last_block(arr);
+
+ ut_ad(arr->buf_end + block->data >= ptr);
+
+ block->used = ptr - block->data;
+
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+
+#ifdef UNIV_DEBUG
+ arr->buf_end = 0;
+#endif
+}
+
+/************************************************************//**
+Returns pointer to an element in dyn array.
+@return pointer to element */
+UNIV_INLINE
+void*
+dyn_array_get_element(
+/*==================*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ ulint pos) /*!< in: position of element as bytes
+ from array start */
+{
+ dyn_block_t* block;
+ ulint used;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ /* Get the first array block */
+ block = dyn_array_get_first_block(arr);
+
+ if (arr->heap != NULL) {
+ used = dyn_block_get_used(block);
+
+ while (pos >= used) {
+ pos -= used;
+ block = UT_LIST_GET_NEXT(list, block);
+ ut_ad(block);
+
+ used = dyn_block_get_used(block);
+ }
+ }
+
+ ut_ad(block);
+ ut_ad(dyn_block_get_used(block) >= pos);
+
+ return(block->data + pos);
+}
+
+/************************************************************//**
+Returns the size of stored data in a dyn array.
+@return data size in bytes */
+UNIV_INLINE
+ulint
+dyn_array_get_data_size(
+/*====================*/
+ dyn_array_t* arr) /*!< in: dyn array */
+{
+ dyn_block_t* block;
+ ulint sum = 0;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ if (arr->heap == NULL) {
+
+ return(arr->used);
+ }
+
+ /* Get the first array block */
+ block = dyn_array_get_first_block(arr);
+
+ while (block != NULL) {
+ sum += dyn_block_get_used(block);
+ block = dyn_array_get_next_block(arr, block);
+ }
+
+ return(sum);
+}
+
+/********************************************************//**
+Pushes n bytes to a dyn array. */
+UNIV_INLINE
+void
+dyn_push_string(
+/*============*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ const byte* str, /*!< in: string to write */
+ ulint len) /*!< in: string length */
+{
+ ulint n_copied;
+
+ while (len > 0) {
+ if (len > DYN_ARRAY_DATA_SIZE) {
+ n_copied = DYN_ARRAY_DATA_SIZE;
+ } else {
+ n_copied = len;
+ }
+
+ memcpy(dyn_array_push(arr, n_copied), str, n_copied);
+
+ str += n_copied;
+ len -= n_copied;
+ }
+}
diff --git a/storage/innobase/include/eval0eval.h b/storage/innobase/include/eval0eval.h
new file mode 100644
index 00000000000..60aefd8d453
--- /dev/null
+++ b/storage/innobase/include/eval0eval.h
@@ -0,0 +1,114 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.h
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0eval_h
+#define eval0eval_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+UNIV_INTERN
+void
+eval_node_free_val_buf(
+/*===================*/
+ que_node_t* node); /*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node); /*!< in: symbol table node */
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node); /*!< in: expression */
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /*!< in: expression node */
+ lint val); /*!< in: value to set */
+/*****************************************************************//**
+Gets an integer value from an expression node.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ que_node_t* node); /*!< in: expression node */
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /*!< in: query graph node */
+ const byte* str, /*!< in: binary string */
+ ulint len); /*!< in: string length or UNIV_SQL_NULL */
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /*!< in: node to copy to */
+ que_node_t* node2); /*!< in: node to copy from */
+/*****************************************************************//**
+Gets a iboolean value from a query node.
+@return iboolean value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+ que_node_t* node); /*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a comparison node.
+@return the result of the comparison */
+UNIV_INTERN
+ibool
+eval_cmp(
+/*=====*/
+ func_node_t* cmp_node); /*!< in: comparison node */
+
+
+#ifndef UNIV_NONINL
+#include "eval0eval.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/eval0eval.ic b/storage/innobase/include/eval0eval.ic
new file mode 100644
index 00000000000..fe767f39b00
--- /dev/null
+++ b/storage/innobase/include/eval0eval.ic
@@ -0,0 +1,251 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.ic
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "pars0grm.h"
+
+/*****************************************************************//**
+Evaluates a function node. */
+UNIV_INTERN
+void
+eval_func(
+/*======*/
+ func_node_t* func_node); /*!< in: function node */
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+UNIV_INTERN
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size); /*!< in: buffer size */
+
+
+/*****************************************************************//**
+Allocates a new buffer if needed.
+@return pointer to buffer */
+UNIV_INLINE
+byte*
+eval_node_ensure_val_buf(
+/*=====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size) /*!< in: buffer size */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+ dfield_set_len(dfield, size);
+
+ data = dfield_get_data(dfield);
+
+ if (!data || que_node_get_val_buf_size(node) < size) {
+
+ data = eval_node_alloc_val_buf(node, size);
+ }
+
+ return(data);
+}
+
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node) /*!< in: symbol table node */
+{
+
+ ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+ if (sym_node->indirection) {
+ /* The symbol table node is an alias for a variable or a
+ column */
+
+ dfield_copy_data(que_node_get_val(sym_node),
+ que_node_get_val(sym_node->indirection));
+ }
+}
+
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node) /*!< in: expression */
+{
+ if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) {
+
+ eval_sym((sym_node_t*)exp_node);
+
+ return;
+ }
+
+ eval_func(exp_node);
+}
+
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /*!< in: expression node */
+ lint val) /*!< in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = dfield_get_data(dfield);
+
+ if (data == NULL) {
+ data = eval_node_alloc_val_buf(node, 4);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ mach_write_to_4(data, (ulint)val);
+}
+
+/*****************************************************************//**
+Gets an integer non-SQL null value from an expression node.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ que_node_t* node) /*!< in: expression node */
+{
+ dfield_t* dfield;
+
+ dfield = que_node_get_val(node);
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ return((int)mach_read_from_4(dfield_get_data(dfield)));
+}
+
+/*****************************************************************//**
+Gets a iboolean value from a query node.
+@return iboolean value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = dfield_get_data(dfield);
+
+ ut_ad(data != NULL);
+
+ return(mach_read_from_1(data));
+}
+
+/*****************************************************************//**
+Sets a iboolean value as the value of a function node. */
+UNIV_INLINE
+void
+eval_node_set_ibool_val(
+/*====================*/
+ func_node_t* func_node, /*!< in: function node */
+ ibool val) /*!< in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(func_node);
+
+ data = dfield_get_data(dfield);
+
+ if (data == NULL) {
+ /* Allocate 1 byte to hold the value */
+
+ data = eval_node_alloc_val_buf(func_node, 1);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 1);
+
+ mach_write_to_1(data, val);
+}
+
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /*!< in: query graph node */
+ const byte* str, /*!< in: binary string */
+ ulint len) /*!< in: string length or UNIV_SQL_NULL */
+{
+ byte* data;
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_len(que_node_get_val(node), len);
+
+ return;
+ }
+
+ data = eval_node_ensure_val_buf(node, len);
+
+ ut_memcpy(data, str, len);
+}
+
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /*!< in: node to copy to */
+ que_node_t* node2) /*!< in: node to copy from */
+{
+ dfield_t* dfield2;
+
+ dfield2 = que_node_get_val(node2);
+
+ eval_node_copy_and_alloc_val(node1, dfield_get_data(dfield2),
+ dfield_get_len(dfield2));
+}
diff --git a/storage/innobase/include/eval0proc.h b/storage/innobase/include/eval0proc.h
new file mode 100644
index 00000000000..13e2e365320
--- /dev/null
+++ b/storage/innobase/include/eval0proc.h
@@ -0,0 +1,104 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.h
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0proc_h
+#define eval0proc_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an if-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+if_step(
+/*====*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+while_step(
+/*=======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+for_step(
+/*=====*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+assign_step(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+exit_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+return_step(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+
+#ifndef UNIV_NONINL
+#include "eval0proc.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/eval0proc.ic b/storage/innobase/include/eval0proc.ic
new file mode 100644
index 00000000000..c602af0a694
--- /dev/null
+++ b/storage/innobase/include/eval0proc.ic
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.ic
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ proc_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_PROC);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ /* Start execution from the first statement in the statement
+ list */
+
+ thr->run_node = node->stat_list;
+ } else {
+ /* Move to the next statement */
+ ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+ thr->run_node = NULL;
+ }
+
+ if (thr->run_node == NULL) {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ func_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ /* Evaluate the procedure */
+
+ eval_exp(node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
new file mode 100644
index 00000000000..a36deaf16ce
--- /dev/null
+++ b/storage/innobase/include/fil0fil.h
@@ -0,0 +1,726 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.h
+The low-level file system
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fil0fil_h
+#define fil0fil_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "dict0types.h"
+#include "ut0byte.h"
+#include "os0file.h"
+
+/** When mysqld is run, the default directory "." is the mysqld datadir,
+but in the MySQL Embedded Server Library and ibbackup it is not the default
+directory, and we must set the base file path explicitly */
+extern const char* fil_path_to_mysql_datadir;
+
+/** Initial size of a single-table tablespace in pages */
+#define FIL_IBD_FILE_INITIAL_SIZE 4
+
+/** 'null' (undefined) page offset in the context of file spaces */
+#define FIL_NULL ULINT32_UNDEFINED
+
+/* Space address data type; this is intended to be used when
+addresses accurate to a byte are stored in file pages. If the page part
+of the address is FIL_NULL, the address is considered undefined. */
+
+typedef byte fil_faddr_t; /*!< 'type' definition in C: an address
+ stored in a file page is a string of bytes */
+#define FIL_ADDR_PAGE 0 /* first in address is the page offset */
+#define FIL_ADDR_BYTE 4 /* then comes 2-byte byte offset within page*/
+
+#define FIL_ADDR_SIZE 6 /* address size is 6 bytes */
+
+/** A struct for storing a space address FIL_ADDR, when it is used
+in C program data structures. */
+
+typedef struct fil_addr_struct fil_addr_t;
+/** File space address */
+struct fil_addr_struct{
+ ulint page; /*!< page number within a space */
+ ulint boffset; /*!< byte offset within the page */
+};
+
+/** The null file address */
+extern fil_addr_t fil_addr_null;
+
+/** The byte offsets on a file page for various variables @{ */
+#define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the
+ page belongs to (== 0) but in later
+ versions the 'new' checksum of the
+ page */
+#define FIL_PAGE_OFFSET 4 /*!< page offset inside space */
+#define FIL_PAGE_PREV 8 /*!< if there is a 'natural'
+ predecessor of the page, its
+ offset. Otherwise FIL_NULL.
+ This field is not set on BLOB
+ pages, which are stored as a
+ singly-linked list. See also
+ FIL_PAGE_NEXT. */
+#define FIL_PAGE_NEXT 12 /*!< if there is a 'natural' successor
+ of the page, its offset.
+ Otherwise FIL_NULL.
+ B-tree index pages
+ (FIL_PAGE_TYPE contains FIL_PAGE_INDEX)
+ on the same PAGE_LEVEL are maintained
+ as a doubly linked list via
+ FIL_PAGE_PREV and FIL_PAGE_NEXT
+ in the collation order of the
+ smallest user record on each page. */
+#define FIL_PAGE_LSN 16 /*!< lsn of the end of the newest
+ modification log record to the page */
+#define FIL_PAGE_TYPE 24 /*!< file page type: FIL_PAGE_INDEX,...,
+ 2 bytes.
+
+ The contents of this field can only
+ be trusted in the following case:
+ if the page is an uncompressed
+ B-tree index page, then it is
+ guaranteed that the value is
+ FIL_PAGE_INDEX.
+ The opposite does not hold.
+
+ In tablespaces created by
+ MySQL/InnoDB 5.1.7 or later, the
+ contents of this field is valid
+ for all uncompressed pages. */
+#define FIL_PAGE_FILE_FLUSH_LSN 26 /*!< this is only defined for the
+ first page in a data file: the file
+ has been flushed to disk at least up
+ to this lsn */
+#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this
+ contains the space id of the page */
+#define FIL_PAGE_DATA 38 /*!< start of the data on the page */
+/* @} */
+/** File page trailer @{ */
+#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used
+ to store the page checksum, the
+ last 4 bytes should be identical
+ to the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */
+/* @} */
+
+/** File page types (values of FIL_PAGE_TYPE) @{ */
+#define FIL_PAGE_INDEX 17855 /*!< B-tree node */
+#define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */
+#define FIL_PAGE_INODE 3 /*!< Index node */
+#define FIL_PAGE_IBUF_FREE_LIST 4 /*!< Insert buffer free list */
+/* File page types introduced in MySQL/InnoDB 5.1.7 */
+#define FIL_PAGE_TYPE_ALLOCATED 0 /*!< Freshly allocated page */
+#define FIL_PAGE_IBUF_BITMAP 5 /*!< Insert buffer bitmap */
+#define FIL_PAGE_TYPE_SYS 6 /*!< System page */
+#define FIL_PAGE_TYPE_TRX_SYS 7 /*!< Transaction system data */
+#define FIL_PAGE_TYPE_FSP_HDR 8 /*!< File space header */
+#define FIL_PAGE_TYPE_XDES 9 /*!< Extent descriptor page */
+#define FIL_PAGE_TYPE_BLOB 10 /*!< Uncompressed BLOB page */
+#define FIL_PAGE_TYPE_ZBLOB 11 /*!< First compressed BLOB page */
+#define FIL_PAGE_TYPE_ZBLOB2 12 /*!< Subsequent compressed BLOB page */
+/* @} */
+
+/** Space types @{ */
+#define FIL_TABLESPACE 501 /*!< tablespace */
+#define FIL_LOG 502 /*!< redo log */
+/* @} */
+
+/** The number of fsyncs done to the log */
+extern ulint fil_n_log_flushes;
+
+/** Number of pending redo log flushes */
+extern ulint fil_n_pending_log_flushes;
+/** Number of pending tablespace flushes */
+extern ulint fil_n_pending_tablespace_flushes;
+
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns the version number of a tablespace, -1 if not found.
+@return version number, -1 if the tablespace does not exist in the
+memory cache */
+UNIV_INTERN
+ib_int64_t
+fil_space_get_version(
+/*==================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the latch of a file space.
+@return latch protecting storage allocation */
+UNIV_INTERN
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+ ulint id, /*!< in: space id */
+ ulint* zip_size);/*!< out: compressed page size, or
+ 0 for uncompressed tablespaces */
+/*******************************************************************//**
+Returns the type of a file space.
+@return FIL_TABLESPACE or FIL_LOG */
+UNIV_INTERN
+ulint
+fil_space_get_type(
+/*===============*/
+ ulint id); /*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Appends a new file to the chain of files of a space. File must be closed. */
+UNIV_INTERN
+void
+fil_node_create(
+/*============*/
+ const char* name, /*!< in: file name (file must be closed) */
+ ulint size, /*!< in: file size in database blocks, rounded
+ downwards to an integer */
+ ulint id, /*!< in: space id where to append */
+ ibool is_raw);/*!< in: TRUE if a raw device or
+ a raw disk partition */
+#ifdef UNIV_LOG_ARCHIVE
+/****************************************************************//**
+Drops files from the start of a file space, so that its size is cut by
+the amount given. */
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+ ulint id, /*!< in: space id */
+ ulint trunc_len); /*!< in: truncate by this much; it is an error
+ if this does not equal to the combined size of
+ some initial files in the space */
+#endif /* UNIV_LOG_ARCHIVE */
+/*******************************************************************//**
+Creates a space memory object and puts it to the 'fil system' hash table. If
+there is an error, prints an error message to the .err log.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+ const char* name, /*!< in: space name */
+ ulint id, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or
+ 0 for uncompressed tablespaces */
+ ulint purpose);/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+/*******************************************************************//**
+Frees a space object from a the tablespace memory cache. Closes the files in
+the chain but does not delete them.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_free(
+/*===========*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache.
+@return space size, 0 if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_size(
+/*===============*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the flags of the space. The tablespace must be cached
+in the memory cache.
+@return flags, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_flags(
+/*================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the compressed page size of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return compressed page size, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_zip_size(
+/*===================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Checks if the pair space, page_no refers to an existing page in a tablespace
+file space. The tablespace must be cached in the memory cache.
+@return TRUE if the address is meaningful */
+UNIV_INTERN
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint page_no);/*!< in: page number */
+/****************************************************************//**
+Initializes the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_init(
+/*=====*/
+ ulint hash_size, /*!< in: hash table size */
+ ulint max_n_open); /*!< in: max number of open files */
+/*******************************************************************//**
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log. */
+UNIV_INTERN
+void
+fil_open_log_and_system_tablespace_files(void);
+/*==========================================*/
+/*******************************************************************//**
+Closes all open files. There must not be any pending i/o's or not flushed
+modifications in the files. */
+UNIV_INTERN
+void
+fil_close_all_files(void);
+/*=====================*/
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+UNIV_INTERN
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+ ulint max_id);/*!< in: maximum known id */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file in the system tablespace.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+ ib_uint64_t lsn, /*!< in: lsn to write */
+ ulint arch_log_no); /*!< in: latest archived log
+ file number */
+/*******************************************************************//**
+Reads the flushed lsn and arch no fields from a data file at database
+startup. */
+UNIV_INTERN
+void
+fil_read_flushed_lsn_and_arch_log_no(
+/*=================================*/
+ os_file_t data_file, /*!< in: open data file */
+ ibool one_read_already, /*!< in: TRUE if min and max
+ parameters below already
+ contain sensible data */
+#ifdef UNIV_LOG_ARCHIVE
+ ulint* min_arch_log_no, /*!< in/out: */
+ ulint* max_arch_log_no, /*!< in/out: */
+#endif /* UNIV_LOG_ARCHIVE */
+ ib_uint64_t* min_flushed_lsn, /*!< in/out: */
+ ib_uint64_t* max_flushed_lsn); /*!< in/out: */
+/*******************************************************************//**
+Increments the count of pending insert buffer page merges, if space is not
+being deleted.
+@return TRUE if being deleted, and ibuf merges should be skipped */
+UNIV_INTERN
+ibool
+fil_inc_pending_ibuf_merges(
+/*========================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Decrements the count of pending insert buffer page merges. */
+UNIV_INTERN
+void
+fil_decr_pending_ibuf_merges(
+/*=========================*/
+ ulint id); /*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If desired, also replays the delete or rename operation if the .ibd file
+exists and the space id in it matches. Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
+datadir that we should use in replaying the file operations.
+@return end of log record, or NULL if the record was not completely
+contained between ptr and end_ptr */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+ byte* ptr, /*!< in: buffer containing the log record body,
+ or an initial segment of it, if the record does
+ not fir completely between ptr and end_ptr */
+ byte* end_ptr, /*!< in: buffer end */
+ ulint type, /*!< in: the type of this log record */
+ ulint space_id, /*!< in: the space id of the tablespace in
+ question, or 0 if the log record should
+ only be parsed but not replayed */
+ ulint log_flags); /*!< in: redo log flags
+ (stored in the page number parameter) */
+/*******************************************************************//**
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_delete_tablespace(
+/*==================*/
+ ulint id); /*!< in: space id */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Discards a single-table tablespace. The tablespace must be cached in the
+memory cache. Discarding is like deleting a tablespace, but
+1) we do not drop the table from the data dictionary;
+2) we remove all insert buffer entries for the tablespace immediately; in DROP
+TABLE they are only removed gradually in the background;
+3) when the user does IMPORT TABLESPACE, the tablespace will have the same id
+as it originally had.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_discard_tablespace(
+/*===================*/
+ ulint id); /*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_rename_tablespace(
+/*==================*/
+ const char* old_name, /*!< in: old table name in the standard
+ databasename/tablename format of
+ InnoDB, or NULL if we do the rename
+ based on the space id only */
+ ulint id, /*!< in: space id */
+ const char* new_name); /*!< in: new table name in the standard
+ databasename/tablename format
+ of InnoDB */
+
+/*******************************************************************//**
+Creates a new single-table tablespace to a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it by simply the
+path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp
+dir of the mysqld server.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fil_create_new_single_table_tablespace(
+/*===================================*/
+ ulint* space_id, /*!< in/out: space id; if this is != 0,
+ then this is an input parameter,
+ otherwise output */
+ const char* tablename, /*!< in: the table name in the usual
+ databasename/tablename format
+ of InnoDB, or a dir path to a temp
+ table */
+ ibool is_temp, /*!< in: TRUE if a table created with
+ CREATE TEMPORARY TABLE */
+ ulint flags, /*!< in: tablespace flags */
+ ulint size); /*!< in: the initial size of the
+ tablespace file in pages,
+ must be >= FIL_IBD_FILE_INITIAL_SIZE */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks the space id is
+right in it. If does not succeed, prints an error message to the .err log. This
+function is used to open a tablespace when we start up mysqld, and also in
+IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_open_single_table_tablespace(
+/*=============================*/
+ ibool check_space_id, /*!< in: should we check that the space
+ id in the file is right; we assume
+ that this function runs much faster
+ if no check is made, since accessing
+ the file inode probably is much
+ faster (the OS caches them) than
+ accessing the first page of the file */
+ ulint id, /*!< in: space id */
+ ulint flags, /*!< in: tablespace flags */
+ const char* name); /*!< in: table name in the
+ databasename/tablename format */
+/********************************************************************//**
+It is possible, though very improbable, that the lsn's in the tablespace to be
+imported have risen above the current system lsn, if a lengthy purge, ibuf
+merge, or rollback was performed on a backup taken with ibbackup. If that is
+the case, reset page lsn's in the file. We assume that mysqld was shut down
+after it performed these cleanup operations on the .ibd file, so that it at
+the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the
+first page of the .ibd file, and we can determine whether we need to reset the
+lsn's just by looking at that flush lsn.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_reset_too_high_lsns(
+/*====================*/
+ const char* name, /*!< in: table name in the
+ databasename/tablename format */
+ ib_uint64_t current_lsn); /*!< in: reset lsn's if the lsn stamped
+ to FIL_PAGE_FILE_FLUSH_LSN in the
+ first page is too high */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+At the server startup, if we need crash recovery, scans the database
+directories under the MySQL datadir, looking for .ibd files. Those files are
+single-table tablespaces. We need to know the space id in each of them so that
+we know into which file we should look to check the contents of a page stored
+in the doublewrite buffer, also to know where to apply log records where the
+space id is != 0.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fil_load_single_table_tablespaces(void);
+/*===================================*/
+/********************************************************************//**
+If we need crash recovery, and we have called
+fil_load_single_table_tablespaces() and dict_load_single_table_tablespaces(),
+we can call this function to print an error message of orphaned .ibd files
+for which there is not a data dictionary entry with a matching table name
+and space id. */
+UNIV_INTERN
+void
+fil_print_orphaned_tablespaces(void);
+/*================================*/
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace does not exist in the memory cache,
+or is being deleted there.
+@return TRUE if does not exist or is being\ deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+ ulint id, /*!< in: space id */
+ ib_int64_t version);/*!< in: tablespace_version should be this; if
+ you pass -1 as the value of this, then this
+ parameter is ignored */
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace exists in the memory cache.
+@return TRUE if exists */
+UNIV_INTERN
+ibool
+fil_tablespace_exists_in_mem(
+/*=========================*/
+ ulint id); /*!< in: space id */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache.
+@return TRUE if a matching tablespace exists in the memory cache */
+UNIV_INTERN
+ibool
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+ ulint id, /*!< in: space id */
+ const char* name, /*!< in: table name in the standard
+ 'databasename/tablename' format or
+ the dir path to a temp table */
+ ibool is_temp, /*!< in: TRUE if created with CREATE
+ TEMPORARY TABLE */
+ ibool mark_space, /*!< in: in crash recovery, at database
+ startup we mark all spaces which have
+ an associated table in the InnoDB
+ data dictionary, so that
+ we can print a warning about orphaned
+ tablespaces */
+ ibool print_error_if_does_not_exist);
+ /*!< in: print detailed error
+ information to the .err log if a
+ matching tablespace is not found from
+ memory */
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Extends all tablespaces to the size stored in the space header. During the
+ibbackup --apply-log phase we extended the spaces on-demand so that log records
+could be appllied, but that may have left spaces still too small compared to
+the size stored in the space header. */
+UNIV_INTERN
+void
+fil_extend_tablespaces_to_stored_len(void);
+/*======================================*/
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+ ulint* actual_size, /*!< out: size of the space after extension;
+ if we ran out of disk space this may be lower
+ than the desired size */
+ ulint space_id, /*!< in: space id */
+ ulint size_after_extend);/*!< in: desired size in pages after the
+ extension; if the current space size is bigger
+ than this already, the function does nothing */
+/*******************************************************************//**
+Tries to reserve free extents in a file space.
+@return TRUE if succeed */
+UNIV_INTERN
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint n_free_now, /*!< in: number of free extents now */
+ ulint n_to_reserve); /*!< in: how many one wants to reserve */
+/*******************************************************************//**
+Releases free extents in a file space. */
+UNIV_INTERN
+void
+fil_space_release_free_extents(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint n_reserved); /*!< in: how many one reserved */
+/*******************************************************************//**
+Gets the number of reserved extents. If the database is silent, this number
+should be zero. */
+UNIV_INTERN
+ulint
+fil_space_get_n_reserved_extents(
+/*=============================*/
+ ulint id); /*!< in: space id */
+/********************************************************************//**
+Reads or writes data. This operation is asynchronous (aio).
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+UNIV_INTERN
+ulint
+fil_io(
+/*===*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE,
+ ORed to OS_FILE_LOG, if a log i/o
+ and ORed to OS_AIO_SIMULATED_WAKE_LATER
+ if simulated aio and we want to post a
+ batch of i/os; NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ ibool sync, /*!< in: TRUE if synchronous aio is desired */
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in
+ aio this must be divisible by the OS block
+ size */
+ ulint len, /*!< in: how many bytes to read or write; this
+ must not cross a file boundary; in aio this
+ must be a block size multiple */
+ void* buf, /*!< in/out: buffer where to store read data
+ or from where to write; in aio this must be
+ appropriately aligned */
+ void* message); /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+/**********************************************************************//**
+Waits for an aio operation to complete. This function is used to write the
+handler for completed requests. The aio array of pending requests is divided
+into segments (see os0file.c for more info). The thread specifies which
+segment it wants to wait for. */
+UNIV_INTERN
+void
+fil_aio_wait(
+/*=========*/
+ ulint segment); /*!< in: the number of the segment in the aio
+ array to wait for */
+/**********************************************************************//**
+Flushes to disk possible writes cached by the OS. If the space does not exist
+or is being dropped, does not do anything. */
+UNIV_INTERN
+void
+fil_flush(
+/*======*/
+ ulint space_id); /*!< in: file space id (this can be a group of
+ log files or a tablespace of the database) */
+/**********************************************************************//**
+Flushes to disk writes in file spaces of the given type possibly cached by
+the OS. */
+UNIV_INTERN
+void
+fil_flush_file_spaces(
+/*==================*/
+ ulint purpose); /*!< in: FIL_TABLESPACE, FIL_LOG */
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fil_validate(void);
+/*==============*/
+/********************************************************************//**
+Returns TRUE if file address is undefined.
+@return TRUE if undefined */
+UNIV_INTERN
+ibool
+fil_addr_is_null(
+/*=============*/
+ fil_addr_t addr); /*!< in: address */
+/********************************************************************//**
+Get the predecessor of a file page.
+@return FIL_PAGE_PREV */
+UNIV_INTERN
+ulint
+fil_page_get_prev(
+/*==============*/
+ const byte* page); /*!< in: file page */
+/********************************************************************//**
+Get the successor of a file page.
+@return FIL_PAGE_NEXT */
+UNIV_INTERN
+ulint
+fil_page_get_next(
+/*==============*/
+ const byte* page); /*!< in: file page */
+/*********************************************************************//**
+Sets the file page type. */
+UNIV_INTERN
+void
+fil_page_set_type(
+/*==============*/
+ byte* page, /*!< in/out: file page */
+ ulint type); /*!< in: type */
+/*********************************************************************//**
+Gets the file page type.
+@return type; NOTE that if the type has not been written to page, the
+return value not defined */
+UNIV_INTERN
+ulint
+fil_page_get_type(
+/*==============*/
+ const byte* page); /*!< in: file page */
+
+
+typedef struct fil_space_struct fil_space_t;
+
+#endif
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
new file mode 100644
index 00000000000..5f7dc58eedc
--- /dev/null
+++ b/storage/innobase/include/fsp0fsp.h
@@ -0,0 +1,359 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.h
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fsp0fsp_h
+#define fsp0fsp_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "fut0lst.h"
+#include "ut0byte.h"
+#include "page0types.h"
+#include "fsp0types.h"
+
+/**********************************************************************//**
+Initializes the file space system. */
+UNIV_INTERN
+void
+fsp_init(void);
+/*==========*/
+/**********************************************************************//**
+Gets the current free limit of the system tablespace. The free limit
+means the place of the first page which has never been put to the the
+free list for allocation. The space above that address is initialized
+to zero. Sets also the global variable log_fsp_current_free_limit.
+@return free limit in megabytes */
+UNIV_INTERN
+ulint
+fsp_header_get_free_limit(void);
+/*===========================*/
+/**********************************************************************//**
+Gets the size of the system tablespace from the tablespace header. If
+we do not have an auto-extending data file, this should be equal to
+the size of the data files. If there is an auto-extending data file,
+this can be smaller.
+@return size in pages */
+UNIV_INTERN
+ulint
+fsp_header_get_tablespace_size(void);
+/*================================*/
+/**********************************************************************//**
+Reads the file space size stored in the header page.
+@return tablespace size stored in the space header */
+UNIV_INTERN
+ulint
+fsp_get_size_low(
+/*=============*/
+ page_t* page); /*!< in: header page (page 0 in the tablespace) */
+/**********************************************************************//**
+Reads the space id from the first page of a tablespace.
+@return space id, ULINT UNDEFINED if error */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+/**********************************************************************//**
+Reads the space flags from the first page of a tablespace.
+@return flags */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+/**********************************************************************//**
+Reads the compressed page size from the first page of a tablespace.
+@return compressed page size in bytes, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+/**********************************************************************//**
+Writes the space id and compressed page size to a tablespace header.
+This function is used past the buffer pool when we in fil0fil.c create
+a new single-table tablespace. */
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+ page_t* page, /*!< in/out: first page in the space */
+ ulint space_id, /*!< in: space id */
+ ulint flags); /*!< in: tablespace flags (FSP_SPACE_FLAGS):
+ 0, or table->flags if newer than COMPACT */
+/**********************************************************************//**
+Initializes the space header of a new created space and creates also the
+insert buffer tree root if space == 0. */
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint size, /*!< in: current size in blocks */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/**********************************************************************//**
+Increases the space size field of a space. */
+UNIV_INTERN
+void
+fsp_header_inc_size(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint size_inc,/*!< in: size increment in pages */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, NULL
+if could not create segment because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment,
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /*!< in: byte offset of the created segment header
+ on the page */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, NULL
+if could not create segment because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create_general(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment,
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /*!< in: byte offset of the created segment header
+ on the page */
+ ibool has_done_reservation, /*!< in: TRUE if the caller has already
+ done the reservation for the pages with
+ fsp_reserve_free_extents (at least 2 extents: one for
+ the inode and the other for the segment) then there is
+ no need to do the check for this individual
+ operation */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return number of reserved pages */
+UNIV_INTERN
+ulint
+fseg_n_reserved_pages(
+/*==================*/
+ fseg_header_t* header, /*!< in: segment header */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr); /*!< in: mtr handle */
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize
+file space fragmentation.
+@return the allocated page offset FIL_NULL if no page could be allocated */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page(
+/*=================*/
+ fseg_header_t* seg_header, /*!< in: segment header */
+ ulint hint, /*!< in: hint of which page would be desirable */
+ byte direction, /*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ mtr_t* mtr); /*!< in: mtr handle */
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@return allocated page offset, FIL_NULL if no page could be allocated */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page_general(
+/*=========================*/
+ fseg_header_t* seg_header,/*!< in: segment header */
+ ulint hint, /*!< in: hint of which page would be desirable */
+ byte direction,/*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ ibool has_done_reservation, /*!< in: TRUE if the caller has
+ already done the reservation for the page
+ with fsp_reserve_free_extents, then there
+ is no need to do the check for this individual
+ page */
+ mtr_t* mtr); /*!< in: mtr handle */
+/**********************************************************************//**
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64 page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages. That is why we apply different rules in that special
+case, just ensuring that there are 3 free pages available.
+@return TRUE if we were able to make the reservation */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+ ulint* n_reserved,/*!< out: number of extents actually reserved; if we
+ return TRUE and the tablespace size is < 64 pages,
+ then this can be 0, otherwise it is n_ext */
+ ulint space, /*!< in: space id */
+ ulint n_ext, /*!< in: number of extents to reserve */
+ ulint alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+This function should be used to get information on how much we still
+will be able to insert new data to the database without running out the
+tablespace. Only free extents are taken into account and we also subtract
+the safety margin required by the above function fsp_reserve_free_extents.
+@return available space in kB */
+UNIV_INTERN
+ullint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+ ulint space); /*!< in: space id */
+/**********************************************************************//**
+Frees a single page of a segment. */
+UNIV_INTERN
+void
+fseg_free_page(
+/*===========*/
+ fseg_header_t* seg_header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr); /*!< in: mtr handle */
+/**********************************************************************//**
+Frees part of a segment. This function can be used to free a segment
+by repeatedly calling this function in different mini-transactions.
+Doing the freeing in a single mini-transaction might result in
+too big a mini-transaction.
+@return TRUE if freeing completed */
+UNIV_INTERN
+ibool
+fseg_free_step(
+/*===========*/
+ fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header
+ resides on the first page of the frag list
+ of the segment, this pointer becomes obsolete
+ after the last freeing step */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed.
+@return TRUE if freeing completed, except the header page */
+UNIV_INTERN
+ibool
+fseg_free_step_not_header(
+/*======================*/
+ fseg_header_t* header, /*!< in: segment header which must reside on
+ the first fragment page of the segment */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************************//**
+Checks if a page address is an extent descriptor page address.
+@return TRUE if a descriptor page */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no);/*!< in: page number */
+/***********************************************************//**
+Parses a redo log record of a file page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+fsp_parse_init_file_page(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr, /*!< in: buffer end */
+ buf_block_t* block); /*!< in: block or NULL */
+/*******************************************************************//**
+Validates the file space system and its segments.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fsp_validate(
+/*=========*/
+ ulint space); /*!< in: space id */
+/*******************************************************************//**
+Prints info of a file space. */
+UNIV_INTERN
+void
+fsp_print(
+/*======*/
+ ulint space); /*!< in: space id */
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a segment.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fseg_validate(
+/*==========*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+UNIV_INTERN
+void
+fseg_print(
+/*=======*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* UNIV_BTR_PRINT */
+
+#ifndef UNIV_NONINL
+#include "fsp0fsp.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic
new file mode 100644
index 00000000000..434c370b527
--- /dev/null
+++ b/storage/innobase/include/fsp0fsp.ic
@@ -0,0 +1,45 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.ic
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+/***********************************************************************//**
+Checks if a page address is an extent descriptor page address.
+@return TRUE if a descriptor page */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no)/*!< in: page number */
+{
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (!zip_size) {
+ return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1))
+ == FSP_XDES_OFFSET));
+ }
+
+ return(UNIV_UNLIKELY((page_no & (zip_size - 1)) == FSP_XDES_OFFSET));
+}
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
new file mode 100644
index 00000000000..496081c2346
--- /dev/null
+++ b/storage/innobase/include/fsp0types.h
@@ -0,0 +1,110 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+@file include/fsp0types.h
+File space management types
+
+Created May 26, 2009 Vasil Dimov
+*******************************************************/
+
+#ifndef fsp0types_h
+#define fsp0types_h
+
+#include "univ.i"
+
+#include "fil0fil.h" /* for FIL_PAGE_DATA */
+
+/** @name Flags for inserting records in order
+If records are inserted in order, there are the following
+flags to tell this (their type is made byte for the compiler
+to warn if direction and hint parameters are switched in
+fseg_alloc_free_page) */
+/* @{ */
+#define FSP_UP ((byte)111) /*!< alphabetically upwards */
+#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */
+#define FSP_NO_DIR ((byte)113) /*!< no order */
+/* @} */
+
+/** File space extent size (one megabyte) in pages */
+#define FSP_EXTENT_SIZE (1 << (20 - UNIV_PAGE_SIZE_SHIFT))
+
+/** On a page of any file segment, data may be put starting from this
+offset */
+#define FSEG_PAGE_DATA FIL_PAGE_DATA
+
+/** @name File segment header
+The file segment header points to the inode describing the file segment. */
+/* @{ */
+/** Data type for file segment header */
+typedef byte fseg_header_t;
+
+#define FSEG_HDR_SPACE 0 /*!< space id of the inode */
+#define FSEG_HDR_PAGE_NO 4 /*!< page number of the inode */
+#define FSEG_HDR_OFFSET 8 /*!< byte offset of the inode */
+
+#define FSEG_HEADER_SIZE 10 /*!< Length of the file system
+ header, in bytes */
+/* @} */
+
+/** Flags for fsp_reserve_free_extents @{ */
+#define FSP_NORMAL 1000000
+#define FSP_UNDO 2000000
+#define FSP_CLEANING 3000000
+/* @} */
+
+/* Number of pages described in a single descriptor page: currently each page
+description takes less than 1 byte; a descriptor page is repeated every
+this many file pages */
+/* #define XDES_DESCRIBED_PER_PAGE UNIV_PAGE_SIZE */
+/* This has been replaced with either UNIV_PAGE_SIZE or page_zip->size. */
+
+/** @name The space low address page map
+The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated
+every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */
+/* @{ */
+/*--------------------------------------*/
+#define FSP_XDES_OFFSET 0 /* !< extent descriptor */
+#define FSP_IBUF_BITMAP_OFFSET 1 /* !< insert buffer bitmap */
+ /* The ibuf bitmap pages are the ones whose
+ page number is the number above plus a
+ multiple of XDES_DESCRIBED_PER_PAGE */
+
+#define FSP_FIRST_INODE_PAGE_NO 2 /*!< in every tablespace */
+ /* The following pages exist
+ in the system tablespace (space 0). */
+#define FSP_IBUF_HEADER_PAGE_NO 3 /*!< insert buffer
+ header page, in
+ tablespace 0 */
+#define FSP_IBUF_TREE_ROOT_PAGE_NO 4 /*!< insert buffer
+ B-tree root page in
+ tablespace 0 */
+ /* The ibuf tree root page number in
+ tablespace 0; its fseg inode is on the page
+ number FSP_FIRST_INODE_PAGE_NO */
+#define FSP_TRX_SYS_PAGE_NO 5 /*!< transaction
+ system header, in
+ tablespace 0 */
+#define FSP_FIRST_RSEG_PAGE_NO 6 /*!< first rollback segment
+ page, in tablespace 0 */
+#define FSP_DICT_HDR_PAGE_NO 7 /*!< data dictionary header
+ page, in tablespace 0 */
+/*--------------------------------------*/
+/* @} */
+
+#endif /* fsp0types_h */
diff --git a/storage/innobase/include/fut0fut.h b/storage/innobase/include/fut0fut.h
new file mode 100644
index 00000000000..dce20b3bad6
--- /dev/null
+++ b/storage/innobase/include/fut0fut.h
@@ -0,0 +1,55 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0fut.h
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+
+#ifndef fut0fut_h
+#define fut0fut_h
+
+#include "univ.i"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+/********************************************************************//**
+Gets a pointer to a file address and latches the page.
+@return pointer to a byte in a frame; the file page in the frame is
+bufferfixed and latched */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fil_addr_t addr, /*!< in: file address */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */
+ mtr_t* mtr); /*!< in: mtr handle */
+
+#ifndef UNIV_NONINL
+#include "fut0fut.ic"
+#endif
+
+#endif
+
diff --git a/storage/innobase/include/fut0fut.ic b/storage/innobase/include/fut0fut.ic
new file mode 100644
index 00000000000..0b52719a055
--- /dev/null
+++ b/storage/innobase/include/fut0fut.ic
@@ -0,0 +1,56 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0fut.ic
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "sync0rw.h"
+#include "buf0buf.h"
+
+/********************************************************************//**
+Gets a pointer to a file address and latches the page.
+@return pointer to a byte in a frame; the file page in the frame is
+bufferfixed and latched */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fil_addr_t addr, /*!< in: file address */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ buf_block_t* block;
+ byte* ptr;
+
+ ut_ad(addr.boffset < UNIV_PAGE_SIZE);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+ block = buf_page_get(space, zip_size, addr.page, rw_latch, mtr);
+ ptr = buf_block_get_frame(block) + addr.boffset;
+
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ return(ptr);
+}
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
new file mode 100644
index 00000000000..fe024c2498f
--- /dev/null
+++ b/storage/innobase/include/fut0lst.h
@@ -0,0 +1,217 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.h
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef fut0lst_h
+#define fut0lst_h
+
+#include "univ.i"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+
+/* The C 'types' of base node and list node: these should be used to
+write self-documenting code. Of course, the sizeof macro cannot be
+applied to these types! */
+
+typedef byte flst_base_node_t;
+typedef byte flst_node_t;
+
+/* The physical size of a list base node in bytes */
+#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
+
+/* The physical size of a list node in bytes */
+#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Initializes a list base node. */
+UNIV_INLINE
+void
+flst_init(
+/*======*/
+ flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Adds a node as the last node in a list. */
+UNIV_INTERN
+void
+flst_add_last(
+/*==========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Adds a node as the first node in a list. */
+UNIV_INTERN
+void
+flst_add_first(
+/*===========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Inserts a node after another in a list. */
+UNIV_INTERN
+void
+flst_insert_after(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node1, /*!< in: node to insert after */
+ flst_node_t* node2, /*!< in: node to add */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Inserts a node before another in a list. */
+UNIV_INTERN
+void
+flst_insert_before(
+/*===============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: node to insert */
+ flst_node_t* node3, /*!< in: node to insert before */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Removes a node. */
+UNIV_INTERN
+void
+flst_remove(
+/*========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: node to remove */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Cuts off the tail of the list, including the node given. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_cut_end(
+/*=========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: first node to remove */
+ ulint n_nodes,/*!< in: number of nodes to remove,
+ must be >= 1 */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Cuts off the tail of the list, not including the given node. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_truncate_end(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: first node not to remove */
+ ulint n_nodes,/*!< in: number of nodes to remove */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list length.
+@return length */
+UNIV_INLINE
+ulint
+flst_get_len(
+/*=========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list first node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_first(
+/*===========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list last node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_last(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list next node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_next_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list prev node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_prev_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Writes a file address. */
+UNIV_INLINE
+void
+flst_write_addr(
+/*============*/
+ fil_faddr_t* faddr, /*!< in: pointer to file faddress */
+ fil_addr_t addr, /*!< in: file address */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Reads a file address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_read_addr(
+/*===========*/
+ const fil_faddr_t* faddr, /*!< in: pointer to file faddress */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Validates a file-based list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+flst_validate(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node of list */
+ mtr_t* mtr1); /*!< in: mtr */
+/********************************************************************//**
+Prints info of a file-based list. */
+UNIV_INTERN
+void
+flst_print(
+/*=======*/
+ const flst_base_node_t* base, /*!< in: pointer to base node of list */
+ mtr_t* mtr); /*!< in: mtr */
+
+
+#ifndef UNIV_NONINL
+#include "fut0lst.ic"
+#endif
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/fut0lst.ic b/storage/innobase/include/fut0lst.ic
new file mode 100644
index 00000000000..dcd13c61871
--- /dev/null
+++ b/storage/innobase/include/fut0lst.ic
@@ -0,0 +1,167 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.ic
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0fut.h"
+#include "mtr0log.h"
+#include "buf0buf.h"
+
+/* We define the field offsets of a node for the list */
+#define FLST_PREV 0 /* 6-byte address of the previous list element;
+ the page part of address is FIL_NULL, if no
+ previous element */
+#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next
+ list element; the page part of address
+ is FIL_NULL, if no next element */
+
+/* We define the field offsets of a base node for the list */
+#define FLST_LEN 0 /* 32-bit list length field */
+#define FLST_FIRST 4 /* 6-byte address of the first element
+ of the list; undefined if empty list */
+#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the
+ last element of the list; undefined
+ if empty list */
+
+/********************************************************************//**
+Writes a file address. */
+UNIV_INLINE
+void
+flst_write_addr(
+/*============*/
+ fil_faddr_t* faddr, /*!< in: pointer to file faddress */
+ fil_addr_t addr, /*!< in: file address */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(faddr && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, faddr, MTR_MEMO_PAGE_X_FIX));
+ ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
+ ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA);
+
+ mlog_write_ulint(faddr + FIL_ADDR_PAGE, addr.page, MLOG_4BYTES, mtr);
+ mlog_write_ulint(faddr + FIL_ADDR_BYTE, addr.boffset,
+ MLOG_2BYTES, mtr);
+}
+
+/********************************************************************//**
+Reads a file address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_read_addr(
+/*===========*/
+ const fil_faddr_t* faddr, /*!< in: pointer to file faddress */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ fil_addr_t addr;
+
+ ut_ad(faddr && mtr);
+
+ addr.page = mtr_read_ulint(faddr + FIL_ADDR_PAGE, MLOG_4BYTES, mtr);
+ addr.boffset = mtr_read_ulint(faddr + FIL_ADDR_BYTE, MLOG_2BYTES,
+ mtr);
+ ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
+ ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA);
+ return(addr);
+}
+
+/********************************************************************//**
+Initializes a list base node. */
+UNIV_INLINE
+void
+flst_init(
+/*======*/
+ flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+
+ mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr);
+ flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr);
+ flst_write_addr(base + FLST_LAST, fil_addr_null, mtr);
+}
+
+/********************************************************************//**
+Gets list length.
+@return length */
+UNIV_INLINE
+ulint
+flst_get_len(
+/*=========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(mtr_read_ulint(base + FLST_LEN, MLOG_4BYTES, mtr));
+}
+
+/********************************************************************//**
+Gets list first node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_first(
+/*===========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(base + FLST_FIRST, mtr));
+}
+
+/********************************************************************//**
+Gets list last node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_last(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(base + FLST_LAST, mtr));
+}
+
+/********************************************************************//**
+Gets list next node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_next_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(node + FLST_NEXT, mtr));
+}
+
+/********************************************************************//**
+Gets list prev node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_prev_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(node + FLST_PREV, mtr));
+}
diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h
new file mode 100644
index 00000000000..1ffbd3440aa
--- /dev/null
+++ b/storage/innobase/include/ha0ha.h
@@ -0,0 +1,241 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0ha.h
+The hash table with external chains
+
+Created 8/18/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef ha0ha_h
+#define ha0ha_h
+
+#include "univ.i"
+
+#include "hash0hash.h"
+#include "page0types.h"
+#include "buf0types.h"
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+void*
+ha_search_and_get_data(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: folded value of the searched data */
+/*********************************************************//**
+Looks for an element when we know the pointer to the data and updates
+the pointer to data if found. */
+UNIV_INTERN
+void
+ha_search_and_update_if_found_func(
+/*===============================*/
+ hash_table_t* table, /*!< in/out: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ void* data, /*!< in: pointer to the data */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* new_block,/*!< in: block containing new_data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ void* new_data);/*!< in: new pointer to the data */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table in/out: hash table
+@param fold in: folded value of the searched data
+@param data in: pointer to the data
+@param new_block in: block containing new_data
+@param new_data in: new pointer to the data */
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+ ha_search_and_update_if_found_func(table,fold,data,new_block,new_data)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table in/out: hash table
+@param fold in: folded value of the searched data
+@param data in: pointer to the data
+@param new_block ignored: block containing new_data
+@param new_data in: new pointer to the data */
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+ ha_search_and_update_if_found_func(table,fold,data,new_data)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/*************************************************************//**
+Creates a hash table with at least n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+ha_create_func(
+/*===========*/
+ ulint n, /*!< in: number of array cells */
+#ifdef UNIV_SYNC_DEBUG
+ ulint mutex_level, /*!< in: level of the mutexes in the latching
+ order: this is used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint n_mutexes); /*!< in: number of mutexes to protect the
+ hash table: must be a power of 2, or 0 */
+#ifdef UNIV_SYNC_DEBUG
+/** Creates a hash table.
+@return own: created table
+@param n_c in: number of array cells. The actual number of cells is
+chosen to be a slightly bigger prime number.
+@param level in: level of the mutexes in the latching order
+@param n_m in: number of mutexes to protect the hash table;
+ must be a power of 2, or 0 */
+# define ha_create(n_c,n_m,level) ha_create_func(n_c,level,n_m)
+#else /* UNIV_SYNC_DEBUG */
+/** Creates a hash table.
+@return own: created table
+@param n_c in: number of array cells. The actual number of cells is
+chosen to be a slightly bigger prime number.
+@param level in: level of the mutexes in the latching order
+@param n_m in: number of mutexes to protect the hash table;
+ must be a power of 2, or 0 */
+# define ha_create(n_c,n_m,level) ha_create_func(n_c,n_m)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*************************************************************//**
+Empties a hash table and frees the memory heaps. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+ hash_table_t* table); /*!< in, own: hash table */
+
+/*************************************************************//**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return TRUE if succeed, FALSE if no more memory could be allocated */
+UNIV_INTERN
+ibool
+ha_insert_for_fold_func(
+/*====================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of data; if a node with
+ the same fold value already exists, it is
+ updated to point to the same data, and no new
+ node is created! */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block, /*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ void* data); /*!< in: data, must not be NULL */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return TRUE if succeed, FALSE if no more memory could be allocated
+@param t in: hash table
+@param f in: folded value of data
+@param b in: buffer block containing the data
+@param d in: data, must not be NULL */
+# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,b,d)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return TRUE if succeed, FALSE if no more memory could be allocated
+@param t in: hash table
+@param f in: folded value of data
+@param b ignored: buffer block containing the data
+@param d in: data, must not be NULL */
+# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,d)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data and deletes
+it from the hash table if found.
+@return TRUE if found */
+UNIV_INLINE
+ibool
+ha_search_and_delete_if_found(
+/*==========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ void* data); /*!< in: pointer to the data */
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Removes from the chain determined by fold all nodes whose data pointer
+points to the page given. */
+UNIV_INTERN
+void
+ha_remove_all_nodes_to_page(
+/*========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: fold value */
+ const page_t* page); /*!< in: buffer page */
+/*************************************************************//**
+Validates a given range of the cells in hash table.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+ha_validate(
+/*========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint start_index, /*!< in: start index */
+ ulint end_index); /*!< in: end index */
+/*************************************************************//**
+Prints info of a hash table. */
+UNIV_INTERN
+void
+ha_print_info(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ hash_table_t* table); /*!< in: hash table */
+#endif /* !UNIV_HOTBACKUP */
+
+/** The hash table external chain node */
+typedef struct ha_node_struct ha_node_t;
+
+/** The hash table external chain node */
+struct ha_node_struct {
+ ha_node_t* next; /*!< next chain node or NULL if none */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block; /*!< buffer block containing the data, or NULL */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ void* data; /*!< pointer to the data */
+ ulint fold; /*!< fold value for the data */
+};
+
+#ifndef UNIV_HOTBACKUP
+/** Assert that the current thread is holding the mutex protecting a
+hash bucket corresponding to a fold value.
+@param table in: hash table
+@param fold in: fold value */
+# define ASSERT_HASH_MUTEX_OWN(table, fold) \
+ ut_ad(!(table)->mutexes || mutex_own(hash_get_mutex(table, fold)))
+#else /* !UNIV_HOTBACKUP */
+/** Assert that the current thread is holding the mutex protecting a
+hash bucket corresponding to a fold value.
+@param table in: hash table
+@param fold in: fold value */
+# define ASSERT_HASH_MUTEX_OWN(table, fold) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "ha0ha.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ha0ha.ic b/storage/innobase/include/ha0ha.ic
new file mode 100644
index 00000000000..734403c4cd9
--- /dev/null
+++ b/storage/innobase/include/ha0ha.ic
@@ -0,0 +1,220 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ha0ha.ic
+The hash table with external chains
+
+Created 8/18/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0rnd.h"
+#include "mem0mem.h"
+
+/***********************************************************//**
+Deletes a hash node. */
+UNIV_INTERN
+void
+ha_delete_hash_node(
+/*================*/
+ hash_table_t* table, /*!< in: hash table */
+ ha_node_t* del_node); /*!< in: node to be deleted */
+
+/******************************************************************//**
+Gets a hash node data.
+@return pointer to the data */
+UNIV_INLINE
+void*
+ha_node_get_data(
+/*=============*/
+ ha_node_t* node) /*!< in: hash chain node */
+{
+ return(node->data);
+}
+
+/******************************************************************//**
+Sets hash node data. */
+UNIV_INLINE
+void
+ha_node_set_data_func(
+/*==================*/
+ ha_node_t* node, /*!< in: hash chain node */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block, /*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ void* data) /*!< in: pointer to the data */
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ node->block = block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data = data;
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Sets hash node data.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Sets hash node data.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+/******************************************************************//**
+Gets the next node in a hash chain.
+@return next node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_next(
+/*==============*/
+ ha_node_t* node) /*!< in: hash chain node */
+{
+ return(node->next);
+}
+
+/******************************************************************//**
+Gets the first node in a hash chain.
+@return first node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_first(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold value determining the chain */
+{
+ return((ha_node_t*)
+ hash_get_nth_cell(table, hash_calc_hash(fold, table))->node);
+}
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the first hash table node in chain having the fold
+number, NULL if not found */
+UNIV_INLINE
+ha_node_t*
+ha_search(
+/*======*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: folded value of the searched data */
+{
+ ha_node_t* node;
+
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->fold == fold) {
+
+ return(node);
+ }
+
+ node = ha_chain_get_next(node);
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+void*
+ha_search_and_get_data(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: folded value of the searched data */
+{
+ ha_node_t* node;
+
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->fold == fold) {
+
+ return(node->data);
+ }
+
+ node = ha_chain_get_next(node);
+ }
+
+ return(NULL);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data.
+@return pointer to the hash table node, NULL if not found in the table */
+UNIV_INLINE
+ha_node_t*
+ha_search_with_data(
+/*================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ void* data) /*!< in: pointer to the data */
+{
+ ha_node_t* node;
+
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->data == data) {
+
+ return(node);
+ }
+
+ node = ha_chain_get_next(node);
+ }
+
+ return(NULL);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data, and deletes
+it from the hash table, if found.
+@return TRUE if found */
+UNIV_INLINE
+ibool
+ha_search_and_delete_if_found(
+/*==========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ void* data) /*!< in: pointer to the data */
+{
+ ha_node_t* node;
+
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+
+ node = ha_search_with_data(table, fold, data);
+
+ if (node) {
+ ha_delete_hash_node(table, node);
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h
new file mode 100644
index 00000000000..c30bd840579
--- /dev/null
+++ b/storage/innobase/include/ha0storage.h
@@ -0,0 +1,140 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.h
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef ha0storage_h
+#define ha0storage_h
+
+#include "univ.i"
+
+/** This value is used by default by ha_storage_create(). More memory
+is allocated later when/if it is needed. */
+#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024
+
+/** This value is used by default by ha_storage_create(). It is a
+constant per ha_storage's lifetime. */
+#define HA_STORAGE_DEFAULT_HASH_CELLS 4096
+
+/** Hash storage */
+typedef struct ha_storage_struct ha_storage_t;
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+ ulint initial_heap_bytes, /*!< in: initial heap's size */
+ ulint initial_hash_cells); /*!< in: initial number of cells
+ in the hash table */
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of storage is going to
+become more than "memlim" then "data" is not added and NULL is returned.
+To disable this behavior "memlim" can be set to 0, which stands for
+"no limit".
+@return pointer to the copy */
+UNIV_INTERN
+const void*
+ha_storage_put_memlim(
+/*==================*/
+ ha_storage_t* storage, /*!< in/out: hash storage */
+ const void* data, /*!< in: data to store */
+ ulint data_len, /*!< in: data length */
+ ulint memlim); /*!< in: memory limit to obey */
+
+/*******************************************************************//**
+Same as ha_storage_put_memlim() but without memory limit.
+@param storage in/out: hash storage
+@param data in: data to store
+@param data_len in: data length
+@return pointer to the copy of the string */
+#define ha_storage_put(storage, data, data_len) \
+ ha_storage_put_memlim((storage), (data), (data_len), 0)
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy. If the
+same string is already present, then pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put
+@return pointer to the copy of the string */
+#define ha_storage_put_str(storage, str) \
+ ((const char*) ha_storage_put((storage), (str), strlen(str) + 1))
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy obeying
+a memory limit.
+If the same string is already present, then pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put
+@param memlim in: memory limit to obey
+@return pointer to the copy of the string */
+#define ha_storage_put_str_memlim(storage, str, memlim) \
+ ((const char*) ha_storage_put_memlim((storage), (str), \
+ strlen(str) + 1, (memlim)))
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+ ha_storage_t** storage); /*!< in/out: hash storage */
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+ ha_storage_t* storage); /*!< in, own: hash storage */
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+ const ha_storage_t* storage); /*!< in: hash storage */
+
+#ifndef UNIV_NONINL
+#include "ha0storage.ic"
+#endif
+
+#endif /* ha0storage_h */
diff --git a/storage/innobase/include/ha0storage.ic b/storage/innobase/include/ha0storage.ic
new file mode 100644
index 00000000000..5acbf82f005
--- /dev/null
+++ b/storage/innobase/include/ha0storage.ic
@@ -0,0 +1,148 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.ic
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 24, 2007 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+
+/** Hash storage for strings */
+struct ha_storage_struct {
+ mem_heap_t* heap; /*!< memory heap from which memory is
+ allocated */
+ hash_table_t* hash; /*!< hash table used to avoid
+ duplicates */
+};
+
+/** Objects of this type are stored in ha_storage_t */
+typedef struct ha_storage_node_struct ha_storage_node_t;
+/** Objects of this type are stored in ha_storage_struct */
+struct ha_storage_node_struct {
+ ulint data_len;/*!< length of the data */
+ const void* data; /*!< pointer to data */
+ ha_storage_node_t* next; /*!< next node in hash chain */
+};
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+ ulint initial_heap_bytes, /*!< in: initial heap's size */
+ ulint initial_hash_cells) /*!< in: initial number of cells
+ in the hash table */
+{
+ ha_storage_t* storage;
+ mem_heap_t* heap;
+
+ if (initial_heap_bytes == 0) {
+
+ initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES;
+ }
+
+ if (initial_hash_cells == 0) {
+
+ initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS;
+ }
+
+ /* we put "storage" within "storage->heap" */
+
+ heap = mem_heap_create(sizeof(ha_storage_t)
+ + initial_heap_bytes);
+
+ storage = (ha_storage_t*) mem_heap_alloc(heap,
+ sizeof(ha_storage_t));
+
+ storage->heap = heap;
+ storage->hash = hash_create(initial_hash_cells);
+
+ return(storage);
+}
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+ ha_storage_t** storage) /*!< in/out: hash storage */
+{
+ ha_storage_t temp_storage;
+
+ temp_storage.heap = (*storage)->heap;
+ temp_storage.hash = (*storage)->hash;
+
+ hash_table_clear(temp_storage.hash);
+ mem_heap_empty(temp_storage.heap);
+
+ *storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap,
+ sizeof(ha_storage_t));
+
+ (*storage)->heap = temp_storage.heap;
+ (*storage)->hash = temp_storage.hash;
+}
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+ ha_storage_t* storage) /*!< in, own: hash storage */
+{
+ /* order is important because the pointer storage->hash is
+ within the heap */
+ hash_table_free(storage->hash);
+ mem_heap_free(storage->heap);
+}
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+ const ha_storage_t* storage) /*!< in: hash storage */
+{
+ ulint ret;
+
+ ret = mem_heap_get_size(storage->heap);
+
+ /* this assumes hash->heap and hash->heaps are NULL */
+ ret += sizeof(hash_table_t);
+ ret += sizeof(hash_cell_t) * hash_get_n_cells(storage->hash);
+
+ return(ret);
+}
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
new file mode 100644
index 00000000000..e8789d1638b
--- /dev/null
+++ b/storage/innobase/include/ha_prototypes.h
@@ -0,0 +1,283 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ha_prototypes.h
+Prototypes for global functions in ha_innodb.cc that are called by
+InnoDB C code
+
+Created 5/11/2006 Osku Salerma
+************************************************************************/
+
+#ifndef HA_INNODB_PROTOTYPES_H
+#define HA_INNODB_PROTOTYPES_H
+
+#include "trx0types.h"
+#include "m_ctype.h" /* CHARSET_INFO */
+
+/*********************************************************************//**
+Wrapper around MySQL's copy_and_convert function.
+@return number of bytes copied to 'to' */
+UNIV_INTERN
+ulint
+innobase_convert_string(
+/*====================*/
+ void* to, /*!< out: converted string */
+ ulint to_length, /*!< in: number of bytes reserved
+ for the converted string */
+ CHARSET_INFO* to_cs, /*!< in: character set to convert to */
+ const void* from, /*!< in: string to convert */
+ ulint from_length, /*!< in: number of bytes to convert */
+ CHARSET_INFO* from_cs, /*!< in: character set to convert from */
+ uint* errors); /*!< out: number of errors encountered
+ during the conversion */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+innobase_raw_format(
+/*================*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint charset_coll, /*!< in: charset collation */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. */
+UNIV_INTERN
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /*!< in: transaction which
+ modifies the table */
+ const char* full_name, /*!< in: concatenation of
+ database name, null char NUL,
+ table name, null char NUL;
+ NOTE that in Windows this is
+ always in LOWER CASE! */
+ ulint full_name_len); /*!< in: full name length where
+ also the null chars count */
+
+/*****************************************************************//**
+Convert a table or index name to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+@return pointer to the end of buf */
+UNIV_INTERN
+char*
+innobase_convert_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: identifier to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ void* thd, /*!< in: MySQL connection thread, or NULL */
+ ibool table_id);/*!< in: TRUE=id is a table or database name;
+ FALSE=id is an index name */
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server. Used in srv_conc_enter_innodb() to determine if the thread
+should be allowed to enter InnoDB - the replication thread is treated
+differently than other threads. Also used in
+srv_conc_force_exit_innodb().
+@return true if thd is the replication thread */
+UNIV_INTERN
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+ void* thd); /*!< in: thread handle (THD*) */
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to rollback in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return true if non-transactional tables have been edited */
+UNIV_INTERN
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+ void* thd); /*!< in: thread handle (THD*) */
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+UNIV_INTERN
+void
+innobase_mysql_print_thd(
+/*=====================*/
+ FILE* f, /*!< in: output stream */
+ void* thd, /*!< in: pointer to a MySQL THD object */
+ uint max_query_len); /*!< in: max query length to print, or 0 to
+ use the default max length */
+
+/**************************************************************//**
+Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
+@return DATA_BINARY, DATA_VARCHAR, ... */
+UNIV_INTERN
+ulint
+get_innobase_type_from_mysql_type(
+/*==============================*/
+ ulint* unsigned_flag, /*!< out: DATA_UNSIGNED if an
+ 'unsigned type';
+ at least ENUM and SET,
+ and unsigned integer
+ types are 'unsigned types' */
+ const void* field) /*!< in: MySQL Field */
+ __attribute__((nonnull));
+
+/*************************************************************//**
+If you want to print a thd that is not associated with the current thread,
+you must call this function before reserving the InnoDB kernel_mutex, to
+protect MySQL from setting thd->query NULL. If you print a thd of the current
+thread, we know that MySQL cannot modify thd->query, and it is not necessary
+to call this. Call innobase_mysql_end_print_arbitrary_thd() after you release
+the kernel_mutex. */
+UNIV_INTERN
+void
+innobase_mysql_prepare_print_arbitrary_thd(void);
+/*============================================*/
+
+/*************************************************************//**
+Releases the mutex reserved by innobase_mysql_prepare_print_arbitrary_thd().
+In the InnoDB latching order, the mutex sits right above the
+kernel_mutex. In debug builds, we assert that the kernel_mutex is
+released before this function is invoked. */
+UNIV_INTERN
+void
+innobase_mysql_end_print_arbitrary_thd(void);
+/*========================================*/
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+ ulint cset, /*!< in: MySQL charset-collation code */
+ ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */
+ ulint* mbmaxlen); /*!< out: maximum length of a char (in bytes) */
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >1 if a>b */
+UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+ const char* a, /*!< in: first string to compare */
+ const char* b); /*!< in: second string to compare */
+
+/******************************************************************//**
+Returns true if the thread is executing a SELECT statement.
+@return true if thd is executing SELECT */
+
+ibool
+thd_is_select(
+/*==========*/
+ const void* thd); /*!< in: thread handle (THD*) */
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len); /*!< in: length of 'to', in bytes; should
+ be at least 5 * strlen(to) + 1 */
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len); /*!< in: length of 'to', in bytes; should
+ be at least 3 * strlen(to) + 1 */
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+ char* a); /*!< in/out: string to put in lower case */
+
+/**********************************************************************//**
+Determines the connection character set.
+@return connection character set */
+struct charset_info_st*
+innobase_get_charset(
+/*=================*/
+ void* mysql_thd); /*!< in: MySQL thread handle */
+
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+finds charset information and returns length of prefix_len characters in the
+index field in bytes.
+@return number of bytes occupied by the first n characters */
+UNIV_INTERN
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+ ulint charset_id, /*!< in: character set id */
+ ulint prefix_len, /*!< in: prefix length in bytes of the index
+ (this has to be divided by mbmaxlen to get the
+ number of CHARACTERS n in the prefix) */
+ ulint data_len, /*!< in: length of the string in bytes */
+ const char* str); /*!< in: character string */
+
+/******************************************************************//**
+Returns true if the thread supports XA,
+global value of innodb_supports_xa if thd is NULL.
+@return true if thd supports XA */
+
+ibool
+thd_supports_xa(
+/*============*/
+ void* thd); /*!< in: thread handle (THD*), or NULL to query
+ the global innodb_supports_xa */
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+ void* thd); /*!< in: thread handle (THD*), or NULL to query
+ the global innodb_lock_wait_timeout */
+
+#endif
diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h
new file mode 100644
index 00000000000..985b76f4f50
--- /dev/null
+++ b/storage/innobase/include/handler0alter.h
@@ -0,0 +1,42 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/handler0alter.h
+Smart ALTER TABLE
+*******************************************************/
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_to_mysql(
+/*==================*/
+ TABLE* table, /*!< in/out: MySQL table */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets); /*!< in: rec_get_offsets(
+ rec, index, ...) */
+
+/*************************************************************//**
+Resets table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_reset(
+/*===============*/
+ TABLE* table); /*!< in/out: MySQL table */
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
new file mode 100644
index 00000000000..977cb829f35
--- /dev/null
+++ b/storage/innobase/include/hash0hash.h
@@ -0,0 +1,446 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.h
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef hash0hash_h
+#define hash0hash_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+#endif /* !UNIV_HOTBACKUP */
+
+typedef struct hash_table_struct hash_table_t;
+typedef struct hash_cell_struct hash_cell_t;
+
+typedef void* hash_node_t;
+
+/* Fix Bug #13859: symbol collision between imap/mysql */
+#define hash_create hash0_create
+
+/*************************************************************//**
+Creates a hash table with >= n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+hash_create(
+/*========*/
+ ulint n); /*!< in: number of array cells */
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Creates a mutex array to protect a hash table. */
+UNIV_INTERN
+void
+hash_create_mutexes_func(
+/*=====================*/
+ hash_table_t* table, /*!< in: hash table */
+#ifdef UNIV_SYNC_DEBUG
+ ulint sync_level, /*!< in: latching order level of the
+ mutexes: used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint n_mutexes); /*!< in: number of mutexes */
+#ifdef UNIV_SYNC_DEBUG
+# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,level,n)
+#else /* UNIV_SYNC_DEBUG */
+# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,n)
+#endif /* UNIV_SYNC_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Frees a hash table. */
+UNIV_INTERN
+void
+hash_table_free(
+/*============*/
+ hash_table_t* table); /*!< in, own: hash table */
+/**************************************************************//**
+Calculates the hash value from a folded value.
+@return hashed value */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+ ulint fold, /*!< in: folded value */
+ hash_table_t* table); /*!< in: hash table */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Assert that the mutex for the table in a hash operation is owned. */
+# define HASH_ASSERT_OWNED(TABLE, FOLD) \
+ut_ad(!(TABLE)->mutexes || mutex_own(hash_get_mutex(TABLE, FOLD)));
+#else /* !UNIV_HOTBACKUP */
+# define HASH_ASSERT_OWNED(TABLE, FOLD)
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Inserts a struct to a hash table. */
+
+#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ HASH_ASSERT_OWNED(TABLE, FOLD)\
+\
+ (DATA)->NAME = NULL;\
+\
+ cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+ if (cell3333->node == NULL) {\
+ cell3333->node = DATA;\
+ } else {\
+ struct3333 = (TYPE*) cell3333->node;\
+\
+ while (struct3333->NAME != NULL) {\
+\
+ struct3333 = (TYPE*) struct3333->NAME;\
+ }\
+\
+ struct3333->NAME = DATA;\
+ }\
+} while (0)
+
+#ifdef UNIV_HASH_DEBUG
+# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
+# define HASH_INVALIDATE(DATA, NAME) DATA->NAME = (void*) -1
+#else
+# define HASH_ASSERT_VALID(DATA) do {} while (0)
+# define HASH_INVALIDATE(DATA, NAME) do {} while (0)
+#endif
+
+/*******************************************************************//**
+Deletes a struct from a hash table. */
+
+#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ HASH_ASSERT_OWNED(TABLE, FOLD)\
+\
+ cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+ if (cell3333->node == DATA) {\
+ HASH_ASSERT_VALID(DATA->NAME);\
+ cell3333->node = DATA->NAME;\
+ } else {\
+ struct3333 = (TYPE*) cell3333->node;\
+\
+ while (struct3333->NAME != DATA) {\
+\
+ struct3333 = (TYPE*) struct3333->NAME;\
+ ut_a(struct3333);\
+ }\
+\
+ struct3333->NAME = DATA->NAME;\
+ }\
+ HASH_INVALIDATE(DATA, NAME);\
+} while (0)
+
+/*******************************************************************//**
+Gets the first struct in a hash chain, NULL if none. */
+
+#define HASH_GET_FIRST(TABLE, HASH_VAL)\
+ (hash_get_nth_cell(TABLE, HASH_VAL)->node)
+
+/*******************************************************************//**
+Gets the next struct in a hash chain, NULL if none. */
+
+#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME)
+
+/********************************************************************//**
+Looks for a struct in a hash table. */
+#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
+{\
+\
+ HASH_ASSERT_OWNED(TABLE, FOLD)\
+\
+ (DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\
+ HASH_ASSERT_VALID(DATA);\
+\
+ while ((DATA) != NULL) {\
+ ASSERTION;\
+ if (TEST) {\
+ break;\
+ } else {\
+ HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\
+ (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\
+ }\
+ }\
+}
+
+/********************************************************************//**
+Looks for an item in all hash buckets. */
+#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \
+do { \
+ ulint i3333; \
+ \
+ for (i3333 = (TABLE)->n_cells; i3333--; ) { \
+ (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \
+ \
+ while ((DATA) != NULL) { \
+ HASH_ASSERT_VALID(DATA); \
+ ASSERTION; \
+ \
+ if (TEST) { \
+ break; \
+ } \
+ \
+ (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \
+ } \
+ \
+ if ((DATA) != NULL) { \
+ break; \
+ } \
+ } \
+} while (0)
+
+/************************************************************//**
+Gets the nth cell in a hash table.
+@return pointer to cell */
+UNIV_INLINE
+hash_cell_t*
+hash_get_nth_cell(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint n); /*!< in: cell index */
+
+/*************************************************************//**
+Clears a hash table so that all the cells become empty. */
+UNIV_INLINE
+void
+hash_table_clear(
+/*=============*/
+ hash_table_t* table); /*!< in/out: hash table */
+
+/*************************************************************//**
+Returns the number of cells in a hash table.
+@return number of cells */
+UNIV_INLINE
+ulint
+hash_get_n_cells(
+/*=============*/
+ hash_table_t* table); /*!< in: table */
+/*******************************************************************//**
+Deletes a struct which is stored in the heap of the hash table, and compacts
+the heap. The fold value must be stored in the struct NODE in a field named
+'fold'. */
+
+#define HASH_DELETE_AND_COMPACT(TYPE, NAME, TABLE, NODE)\
+do {\
+ TYPE* node111;\
+ TYPE* top_node111;\
+ hash_cell_t* cell111;\
+ ulint fold111;\
+\
+ fold111 = (NODE)->fold;\
+\
+ HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\
+\
+ top_node111 = (TYPE*)mem_heap_get_top(\
+ hash_get_heap(TABLE, fold111),\
+ sizeof(TYPE));\
+\
+ /* If the node to remove is not the top node in the heap, compact the\
+ heap of nodes by moving the top node in the place of NODE. */\
+\
+ if (NODE != top_node111) {\
+\
+ /* Copy the top node in place of NODE */\
+\
+ *(NODE) = *top_node111;\
+\
+ cell111 = hash_get_nth_cell(TABLE,\
+ hash_calc_hash(top_node111->fold, TABLE));\
+\
+ /* Look for the pointer to the top node, to update it */\
+\
+ if (cell111->node == top_node111) {\
+ /* The top node is the first in the chain */\
+\
+ cell111->node = NODE;\
+ } else {\
+ /* We have to look for the predecessor of the top\
+ node */\
+ node111 = cell111->node;\
+\
+ while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\
+\
+ node111 = HASH_GET_NEXT(NAME, node111);\
+ }\
+\
+ /* Now we have the predecessor node */\
+\
+ node111->NAME = NODE;\
+ }\
+ }\
+\
+ /* Free the space occupied by the top node */\
+\
+ mem_heap_free_top(hash_get_heap(TABLE, fold111), sizeof(TYPE));\
+} while (0)
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Move all hash table entries from OLD_TABLE to NEW_TABLE. */
+
+#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \
+do {\
+ ulint i2222;\
+ ulint cell_count2222;\
+\
+ cell_count2222 = hash_get_n_cells(OLD_TABLE);\
+\
+ for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
+ NODE_TYPE* node2222 = HASH_GET_FIRST((OLD_TABLE), i2222);\
+\
+ while (node2222) {\
+ NODE_TYPE* next2222 = node2222->PTR_NAME;\
+ ulint fold2222 = FOLD_FUNC(node2222);\
+\
+ HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\
+ fold2222, node2222);\
+\
+ node2222 = next2222;\
+ }\
+ }\
+} while (0)
+
+/************************************************************//**
+Gets the mutex index for a fold value in a hash table.
+@return mutex number */
+UNIV_INLINE
+ulint
+hash_get_mutex_no(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Gets the nth heap in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_nth_heap(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i); /*!< in: index of the heap */
+/************************************************************//**
+Gets the heap for a fold value in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_heap(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Gets the nth mutex in a hash table.
+@return mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_nth_mutex(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i); /*!< in: index of the mutex */
+/************************************************************//**
+Gets the mutex for a fold value in a hash table.
+@return mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_mutex(
+/*===========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Reserves the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_enter(
+/*=============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Releases the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit(
+/*============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Reserves all the mutexes of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_mutex_enter_all(
+/*=================*/
+ hash_table_t* table); /*!< in: hash table */
+/************************************************************//**
+Releases all the mutexes of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all(
+/*================*/
+ hash_table_t* table); /*!< in: hash table */
+#else /* !UNIV_HOTBACKUP */
+# define hash_get_heap(table, fold) ((table)->heap)
+# define hash_mutex_enter(table, fold) ((void) 0)
+# define hash_mutex_exit(table, fold) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+struct hash_cell_struct{
+ void* node; /*!< hash chain node, NULL if none */
+};
+
+/* The hash table structure */
+struct hash_table_struct {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+ ibool adaptive;/* TRUE if this is the hash table of the
+ adaptive hash index */
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ ulint n_cells;/* number of cells in the hash table */
+ hash_cell_t* array; /*!< pointer to cell array */
+#ifndef UNIV_HOTBACKUP
+ ulint n_mutexes;/* if mutexes != NULL, then the number of
+ mutexes, must be a power of 2 */
+ mutex_t* mutexes;/* NULL, or an array of mutexes used to
+ protect segments of the hash table */
+ mem_heap_t** heaps; /*!< if this is non-NULL, hash chain nodes for
+ external chaining can be allocated from these
+ memory heaps; there are then n_mutexes many of
+ these heaps */
+#endif /* !UNIV_HOTBACKUP */
+ mem_heap_t* heap;
+ ulint magic_n;
+};
+
+#define HASH_TABLE_MAGIC_N 76561114
+
+#ifndef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/hash0hash.ic b/storage/innobase/include/hash0hash.ic
new file mode 100644
index 00000000000..19da2d50701
--- /dev/null
+++ b/storage/innobase/include/hash0hash.ic
@@ -0,0 +1,163 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.ic
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ut0rnd.h"
+
+/************************************************************//**
+Gets the nth cell in a hash table.
+@return pointer to cell */
+UNIV_INLINE
+hash_cell_t*
+hash_get_nth_cell(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint n) /*!< in: cell index */
+{
+ ut_ad(n < table->n_cells);
+
+ return(table->array + n);
+}
+
+/*************************************************************//**
+Clears a hash table so that all the cells become empty. */
+UNIV_INLINE
+void
+hash_table_clear(
+/*=============*/
+ hash_table_t* table) /*!< in/out: hash table */
+{
+ memset(table->array, 0x0,
+ table->n_cells * sizeof(*table->array));
+}
+
+/*************************************************************//**
+Returns the number of cells in a hash table.
+@return number of cells */
+UNIV_INLINE
+ulint
+hash_get_n_cells(
+/*=============*/
+ hash_table_t* table) /*!< in: table */
+{
+ return(table->n_cells);
+}
+
+/**************************************************************//**
+Calculates the hash value from a folded value.
+@return hashed value */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+ ulint fold, /*!< in: folded value */
+ hash_table_t* table) /*!< in: hash table */
+{
+ return(ut_hash_ulint(fold, table->n_cells));
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Gets the mutex index for a fold value in a hash table.
+@return mutex number */
+UNIV_INLINE
+ulint
+hash_get_mutex_no(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ut_ad(ut_is_2pow(table->n_mutexes));
+ return(ut_2pow_remainder(hash_calc_hash(fold, table),
+ table->n_mutexes));
+}
+
+/************************************************************//**
+Gets the nth heap in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_nth_heap(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i) /*!< in: index of the heap */
+{
+ ut_ad(i < table->n_mutexes);
+
+ return(table->heaps[i]);
+}
+
+/************************************************************//**
+Gets the heap for a fold value in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_heap(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ulint i;
+
+ if (table->heap) {
+ return(table->heap);
+ }
+
+ i = hash_get_mutex_no(table, fold);
+
+ return(hash_get_nth_heap(table, i));
+}
+
+/************************************************************//**
+Gets the nth mutex in a hash table.
+@return mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_nth_mutex(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i) /*!< in: index of the mutex */
+{
+ ut_ad(i < table->n_mutexes);
+
+ return(table->mutexes + i);
+}
+
+/************************************************************//**
+Gets the mutex for a fold value in a hash table.
+@return mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_mutex(
+/*===========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ulint i;
+
+ i = hash_get_mutex_no(table, fold);
+
+ return(hash_get_nth_mutex(table, i));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
new file mode 100644
index 00000000000..21330997df3
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -0,0 +1,377 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.h
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0ibuf_h
+#define ibuf0ibuf_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "ibuf0types.h"
+
+/** Combinations of operations that can be buffered. Because the enum
+values are used for indexing innobase_change_buffering_values[], they
+should start at 0 and there should not be any gaps. */
+typedef enum {
+ IBUF_USE_NONE = 0,
+ IBUF_USE_INSERT, /* insert */
+
+ IBUF_USE_COUNT /* number of entries in ibuf_use_t */
+} ibuf_use_t;
+
+/** Operations that can currently be buffered. */
+extern ibuf_use_t ibuf_use;
+
+/** The insert buffer control structure */
+extern ibuf_t* ibuf;
+
+/* The purpose of the insert buffer is to reduce random disk access.
+When we wish to insert a record into a non-unique secondary index and
+the B-tree leaf page where the record belongs to is not in the buffer
+pool, we insert the record into the insert buffer B-tree, indexed by
+(space_id, page_no). When the page is eventually read into the buffer
+pool, we look up the insert buffer B-tree for any modifications to the
+page, and apply these upon the completion of the read operation. This
+is called the insert buffer merge. */
+
+/* The insert buffer merge must always succeed. To guarantee this,
+the insert buffer subsystem keeps track of the free space in pages for
+which it can buffer operations. Two bits per page in the insert
+buffer bitmap indicate the available space in coarse increments. The
+free bits in the insert buffer bitmap must never exceed the free space
+on a page. It is safe to decrement or reset the bits in the bitmap in
+a mini-transaction that is committed before the mini-transaction that
+affects the free space. It is unsafe to increment the bits in a
+separately committed mini-transaction, because in crash recovery, the
+free bits could momentarily be set too high. */
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup and
+initializes the data structures for the insert buffer of each tablespace. */
+UNIV_INTERN
+void
+ibuf_init_at_db_start(void);
+/*=======================*/
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+UNIV_INTERN
+void
+ibuf_update_max_tablespace_id(void);
+/*===============================*/
+/*********************************************************************//**
+Initializes an ibuf bitmap page. */
+UNIV_INTERN
+void
+ibuf_bitmap_page_init(
+/*==================*/
+ buf_block_t* block, /*!< in: bitmap page */
+ mtr_t* mtr); /*!< in: mtr */
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+UNIV_INTERN
+void
+ibuf_reset_free_bits(
+/*=================*/
+ buf_block_t* block); /*!< in: index page; free bits are set to 0
+ if the index is a non-clustered
+ non-unique, and page level is 0 */
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ buf_block_t* block, /*!< in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/*!< in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase);/*!< in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_low(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ ulint max_ins_size, /*!< in: value of
+ maximum insert size
+ with reorganize before
+ the latest operation
+ performed to the page */
+ mtr_t* mtr); /*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+ buf_block_t* block, /*!< in/out: index page */
+ mtr_t* mtr); /*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page. It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ buf_block_t* block1, /*!< in: index page */
+ buf_block_t* block2, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+A basic partial test if an insert to the insert buffer could be possible and
+recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint ignore_sec_unique); /*!< in: if != 0, we should
+ ignore UNIQUE constraint on
+ a secondary index when we
+ decide */
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INTERN
+ibool
+ibuf_inside(void);
+/*=============*/
+/***********************************************************************//**
+Checks if a page address is an ibuf bitmap page (level 3 page) address.
+@return TRUE if a bitmap page */
+UNIV_INLINE
+ibool
+ibuf_bitmap_page(
+/*=============*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no);/*!< in: page number */
+/***********************************************************************//**
+Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==TRUE.
+@return TRUE if level 2 or level 3 page */
+UNIV_INTERN
+ibool
+ibuf_page(
+/*======*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number */
+ mtr_t* mtr); /*!< in: mtr which will contain an x-latch to the
+ bitmap page if the page is not one of the fixed
+ address ibuf pages, or NULL, in which case a new
+ transaction is created. */
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+UNIV_INTERN
+void
+ibuf_free_excess_pages(void);
+/*========================*/
+/*********************************************************************//**
+Makes an index insert to the insert buffer, instead of directly to the disk
+page, if this is possible. Does not do insert if the index is clustered
+or unique.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+ibuf_insert(
+/*========*/
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint space, /*!< in: space id where to insert */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number where to insert */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+When an index page is read from a disk to the buffer pool, this function
+inserts to the page the possible index entries buffered in the insert buffer.
+The entries are deleted from the insert buffer. If the page is not read, but
+created in the buffer pool, this function deletes its buffered entries from
+the insert buffer; there can exist entries for such a page if the page
+belonged to an index which subsequently was dropped. */
+UNIV_INTERN
+void
+ibuf_merge_or_delete_for_page(
+/*==========================*/
+ buf_block_t* block, /*!< in: if page has been read from
+ disk, pointer to the page x-latched,
+ else NULL */
+ ulint space, /*!< in: space id of the index page */
+ ulint page_no,/*!< in: page number of the index page */
+ ulint zip_size,/*!< in: compressed page size in bytes,
+ or 0 */
+ ibool update_ibuf_bitmap);/*!< in: normally this is set
+ to TRUE, but if we have deleted or are
+ deleting the tablespace, then we
+ naturally do not want to update a
+ non-existent bitmap page */
+/*********************************************************************//**
+Deletes all entries in the insert buffer for a given space id. This is used
+in DISCARD TABLESPACE and IMPORT TABLESPACE.
+NOTE: this does not update the page free bitmaps in the space. The space will
+become CORRUPT when you call this function! */
+UNIV_INTERN
+void
+ibuf_delete_for_discarded_space(
+/*============================*/
+ ulint space); /*!< in: space id */
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract(
+/*==========*/
+ ibool sync); /*!< in: TRUE if the caller wants to wait for the
+ issued read with the highest tablespace address
+ to complete */
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract_for_n_pages(
+/*======================*/
+ ibool sync, /*!< in: TRUE if the caller wants to wait for the
+ issued read with the highest tablespace address
+ to complete */
+ ulint n_pages);/*!< in: try to read at least this many pages to
+ the buffer pool and merge the ibuf contents to
+ them */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Parses a redo log record of an ibuf bitmap page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+ibuf_parse_bitmap_init(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: block or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_IBUF_COUNT_DEBUG
+/******************************************************************//**
+Gets the ibuf count for a given page.
+@return number of entries in the insert buffer currently buffered for
+this page */
+UNIV_INTERN
+ulint
+ibuf_count_get(
+/*===========*/
+ ulint space, /*!< in: space id */
+ ulint page_no);/*!< in: page number */
+#endif
+/******************************************************************//**
+Looks if the insert buffer is empty.
+@return TRUE if empty */
+UNIV_INTERN
+ibool
+ibuf_is_empty(void);
+/*===============*/
+/******************************************************************//**
+Prints info of ibuf. */
+UNIV_INTERN
+void
+ibuf_print(
+/*=======*/
+ FILE* file); /*!< in: file where to print */
+
+#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO
+#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO
+
+#endif /* !UNIV_HOTBACKUP */
+
+/* The ibuf header page currently contains only the file segment header
+for the file segment from which the pages for the ibuf tree are allocated */
+#define IBUF_HEADER PAGE_DATA
+#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */
+
+/* The insert buffer tree itself is always located in space 0. */
+#define IBUF_SPACE_ID 0
+
+#ifndef UNIV_NONINL
+#include "ibuf0ibuf.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic
new file mode 100644
index 00000000000..15bbe61ab30
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.ic
@@ -0,0 +1,327 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.ic
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0lru.h"
+
+/** Counter for ibuf_should_try() */
+extern ulint ibuf_flush_count;
+
+/** An index page must contain at least UNIV_PAGE_SIZE /
+IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
+buffer inserts to this page. If there is this much of free space, the
+corresponding bits are set in the ibuf bitmap. */
+#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32
+
+/** Insert buffer struct */
+struct ibuf_struct{
+ ulint size; /*!< current size of the ibuf index
+ tree, in pages */
+ ulint max_size; /*!< recommended maximum size of the
+ ibuf index tree, in pages */
+ ulint seg_size; /*!< allocated pages of the file
+ segment containing ibuf header and
+ tree */
+ ibool empty; /*!< after an insert to the ibuf tree
+ is performed, this is set to FALSE,
+ and if a contract operation finds
+ the tree empty, this is set to
+ TRUE */
+ ulint free_list_len; /*!< length of the free list */
+ ulint height; /*!< tree height */
+ dict_index_t* index; /*!< insert buffer index */
+
+ ulint n_inserts; /*!< number of inserts made to
+ the insert buffer */
+ ulint n_merges; /*!< number of pages merged */
+ ulint n_merged_recs; /*!< number of records merged */
+};
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INTERN
+void
+ibuf_set_free_bits_func(
+/*====================*/
+ buf_block_t* block, /*!< in: index page of a non-clustered index;
+ free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+ ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
+ value which the bits must have before
+ setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+ ulint val); /*!< in: value to set: < 4 */
+#ifdef UNIV_IBUF_DEBUG
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v)
+#else /* UNIV_IBUF_DEBUG */
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v)
+#endif /* UNIV_IBUF_DEBUG */
+
+/**********************************************************************//**
+A basic partial test if an insert to the insert buffer could be possible and
+recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint ignore_sec_unique) /*!< in: if != 0, we should
+ ignore UNIQUE constraint on
+ a secondary index when we
+ decide */
+{
+ if (ibuf_use != IBUF_USE_NONE
+ && !dict_index_is_clust(index)
+ && (ignore_sec_unique || !dict_index_is_unique(index))) {
+
+ ibuf_flush_count++;
+
+ if (ibuf_flush_count % 4 == 0) {
+
+ buf_LRU_try_free_flushed_blocks();
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************************//**
+Checks if a page address is an ibuf bitmap page address.
+@return TRUE if a bitmap page */
+UNIV_INLINE
+ibool
+ibuf_bitmap_page(
+/*=============*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no)/*!< in: page number */
+{
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (!zip_size) {
+ return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1))
+ == FSP_IBUF_BITMAP_OFFSET));
+ }
+
+ return(UNIV_UNLIKELY((page_no & (zip_size - 1))
+ == FSP_IBUF_BITMAP_OFFSET));
+}
+
+/*********************************************************************//**
+Translates the free space on a page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_bits(
+/*===========================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint max_ins_size) /*!< in: maximum insert size after reorganize
+ for the page */
+{
+ ulint n;
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+ if (zip_size) {
+ n = max_ins_size
+ / (zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ } else {
+ n = max_ins_size
+ / (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ if (n == 3) {
+ n = 2;
+ }
+
+ if (n > 3) {
+ n = 3;
+ }
+
+ return(n);
+}
+
+/*********************************************************************//**
+Translates the ibuf free bits to the free space on a page in bytes.
+@return maximum insert size after reorganize for the page */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_from_bits(
+/*================================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint bits) /*!< in: value for ibuf bitmap bits */
+{
+ ut_ad(bits < 4);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+ if (zip_size) {
+ if (bits == 3) {
+ return(4 * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ return(bits * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ if (bits == 3) {
+ return(4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ return(bits * (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE));
+}
+
+/*********************************************************************//**
+Translates the free space on a compressed page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_zip(
+/*==========================*/
+ ulint zip_size,
+ /*!< in: compressed page size in bytes */
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ ulint max_ins_size;
+ const page_zip_des_t* page_zip;
+ lint zip_max_ins;
+
+ ut_ad(zip_size == buf_block_get_zip_size(block));
+ ut_ad(zip_size);
+
+ max_ins_size = page_get_max_insert_size_after_reorganize(
+ buf_block_get_frame(block), 1);
+
+ page_zip = buf_block_get_page_zip(block);
+ zip_max_ins = page_zip_max_ins_size(page_zip,
+ FALSE/* not clustered */);
+
+ if (UNIV_UNLIKELY(zip_max_ins < 0)) {
+ return(0);
+ } else if (UNIV_LIKELY(max_ins_size > (ulint) zip_max_ins)) {
+ max_ins_size = (ulint) zip_max_ins;
+ }
+
+ return(ibuf_index_page_calc_free_bits(zip_size, max_ins_size));
+}
+
+/*********************************************************************//**
+Translates the free space on a page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free(
+/*======================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ ut_ad(zip_size == buf_block_get_zip_size(block));
+
+ if (!zip_size) {
+ ulint max_ins_size;
+
+ max_ins_size = page_get_max_insert_size_after_reorganize(
+ buf_block_get_frame(block), 1);
+
+ return(ibuf_index_page_calc_free_bits(0, max_ins_size));
+ } else {
+ return(ibuf_index_page_calc_free_zip(zip_size, block));
+ }
+}
+
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ buf_block_t* block, /*!< in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/*!< in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase)/*!< in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+{
+ ulint before;
+ ulint after;
+
+ ut_ad(!buf_block_get_page_zip(block));
+
+ before = ibuf_index_page_calc_free_bits(0, max_ins_size);
+
+ if (max_ins_size >= increase) {
+#if ULINT32_UNDEFINED <= UNIV_PAGE_SIZE
+# error "ULINT32_UNDEFINED <= UNIV_PAGE_SIZE"
+#endif
+ after = ibuf_index_page_calc_free_bits(0, max_ins_size
+ - increase);
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(after <= ibuf_index_page_calc_free(0, block));
+#endif
+ } else {
+ after = ibuf_index_page_calc_free(0, block);
+ }
+
+ if (after == 0) {
+ /* We move the page to the front of the buffer pool LRU list:
+ the purpose of this is to prevent those pages to which we
+ cannot make inserts using the insert buffer from slipping
+ out of the buffer pool */
+
+ buf_page_make_young(&block->page);
+ }
+
+ if (before > after) {
+ ibuf_set_free_bits(block, after, before);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/ibuf0types.h b/storage/innobase/include/ibuf0types.h
new file mode 100644
index 00000000000..55944f879b2
--- /dev/null
+++ b/storage/innobase/include/ibuf0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0types.h
+Insert buffer global types
+
+Created 7/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0types_h
+#define ibuf0types_h
+
+typedef struct ibuf_struct ibuf_t;
+
+#endif
diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h
new file mode 100644
index 00000000000..25a57c9740c
--- /dev/null
+++ b/storage/innobase/include/lock0iter.h
@@ -0,0 +1,69 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0iter.h
+Lock queue iterator type and function prototypes.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0iter_h
+#define lock0iter_h
+
+#include "univ.i"
+#include "lock0types.h"
+
+typedef struct lock_queue_iterator_struct {
+ const lock_t* current_lock;
+ /* In case this is a record lock queue (not table lock queue)
+ then bit_no is the record number within the heap in which the
+ record is stored. */
+ ulint bit_no;
+} lock_queue_iterator_t;
+
+/*******************************************************************//**
+Initialize lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+ bit_no is calculated in this function by using
+ lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+ of a wait lock. */
+UNIV_INTERN
+void
+lock_queue_iterator_reset(
+/*======================*/
+ lock_queue_iterator_t* iter, /*!< out: iterator */
+ const lock_t* lock, /*!< in: lock to start from */
+ ulint bit_no);/*!< in: record number in the
+ heap */
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue, returns NULL if there are no
+more locks (i.e. the current lock is the first one). The iterator is
+receded (if not-NULL is returned).
+@return previous lock or NULL */
+
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+ lock_queue_iterator_t* iter); /*!< in/out: iterator */
+
+#endif /* lock0iter_h */
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
new file mode 100644
index 00000000000..fa5db831d4f
--- /dev/null
+++ b/storage/innobase/include/lock0lock.h
@@ -0,0 +1,809 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.h
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0lock_h
+#define lock0lock_h
+
+#include "univ.i"
+#include "buf0types.h"
+#include "trx0types.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "dict0types.h"
+#include "que0types.h"
+#include "lock0types.h"
+#include "read0types.h"
+#include "hash0hash.h"
+#include "ut0vec.h"
+
+#ifdef UNIV_DEBUG
+extern ibool lock_print_waits;
+#endif /* UNIV_DEBUG */
+/* Buffer for storing information about the most recent deadlock error */
+extern FILE* lock_latest_err_file;
+
+/*********************************************************************//**
+Gets the size of a lock struct.
+@return size in bytes */
+UNIV_INTERN
+ulint
+lock_get_size(void);
+/*===============*/
+/*********************************************************************//**
+Creates the lock system at database start. */
+UNIV_INTERN
+void
+lock_sys_create(
+/*============*/
+ ulint n_cells); /*!< in: number of slots in lock hash table */
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction which has the x-lock, or NULL */
+UNIV_INLINE
+trx_t*
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block); /*!< in: buffer block */
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. NOTE: we copy
+also the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+UNIV_INTERN
+void
+lock_move_reorganize_page(
+/*======================*/
+ const buf_block_t* block, /*!< in: old index page, now
+ reorganized */
+ const buf_block_t* oblock);/*!< in: copy of the old, not
+ reorganized page */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_end(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec); /*!< in: record on page: this
+ is the first record moved */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_start(
+/*=====================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec, /*!< in: record on page:
+ this is the first
+ record NOT copied */
+ const rec_t* old_end); /*!< in: old
+ previous-to-last
+ record on new_page
+ before the records
+ were copied */
+/*************************************************************//**
+Updates the lock table when a page is split to the right. */
+UNIV_INTERN
+void
+lock_update_split_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block); /*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the right. */
+UNIV_INTERN
+void
+lock_update_merge_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page to
+ which merged */
+ const rec_t* orig_succ, /*!< in: original
+ successor of infimum
+ on the right page
+ before merge */
+ const buf_block_t* left_block); /*!< in: merged index
+ page which will be
+ discarded */
+/*************************************************************//**
+Updates the lock table when the root page is copied to another in
+btr_root_raise_and_insert. Note that we leave lock structs on the
+root page, even though they do not make sense on other than leaf
+pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+UNIV_INTERN
+void
+lock_update_root_raise(
+/*===================*/
+ const buf_block_t* block, /*!< in: index page to which copied */
+ const buf_block_t* root); /*!< in: root page */
+/*************************************************************//**
+Updates the lock table when a page is copied to another and the original page
+is removed from the chain of leaf pages, except if page is the root! */
+UNIV_INTERN
+void
+lock_update_copy_and_discard(
+/*=========================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ which copied */
+ const buf_block_t* block); /*!< in: index page;
+ NOT the root! */
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+UNIV_INTERN
+void
+lock_update_split_left(
+/*===================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block); /*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the left. */
+UNIV_INTERN
+void
+lock_update_merge_left(
+/*===================*/
+ const buf_block_t* left_block, /*!< in: left page to
+ which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor
+ of supremum on the left page
+ before merge */
+ const buf_block_t* right_block); /*!< in: merged index page
+ which will be discarded */
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+UNIV_INTERN
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+ const buf_block_t* heir_block, /*!< in: block containing the
+ record which inherits */
+ const buf_block_t* block, /*!< in: block containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no); /*!< in: heap_no of the
+ donating record */
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+UNIV_INTERN
+void
+lock_update_discard(
+/*================*/
+ const buf_block_t* heir_block, /*!< in: index page
+ which will inherit the locks */
+ ulint heir_heap_no, /*!< in: heap_no of the record
+ which will inherit the locks */
+ const buf_block_t* block); /*!< in: index page
+ which will be discarded */
+/*************************************************************//**
+Updates the lock table when a new user record is inserted. */
+UNIV_INTERN
+void
+lock_update_insert(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: the inserted record */
+/*************************************************************//**
+Updates the lock table when a record is removed. */
+UNIV_INTERN
+void
+lock_update_delete(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: the record to be removed */
+/*********************************************************************//**
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. The record
+is in such an update moved, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+UNIV_INTERN
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: record whose lock state
+ is stored on the infimum
+ record of the same page; lock
+ bits are reset on the
+ record */
+/*********************************************************************//**
+Restores the state of explicit lock requests on a single record, where the
+state was stored on the infimum of the page. */
+UNIV_INTERN
+void
+lock_rec_restore_from_page_infimum(
+/*===============================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record whose lock state
+ is restored */
+ const buf_block_t* donator);/*!< in: page (rec is not
+ necessarily on this page)
+ whose infimum stored the lock
+ state; lock bits are reset on
+ the infimum */
+/*********************************************************************//**
+Returns TRUE if there are explicit record locks on a page.
+@return TRUE if there are explicit record locks on the page */
+UNIV_INTERN
+ibool
+lock_rec_expl_exist_on_page(
+/*========================*/
+ ulint space, /*!< in: space id */
+ ulint page_no);/*!< in: page number */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_rec_insert_check_and_lock(
+/*===========================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
+ set, does nothing */
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ ibool* inherit);/*!< out: set to TRUE if the new
+ inserted record maybe should inherit
+ LOCK_GAP type locks from the successor
+ record */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify
+(delete mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/*********************************************************************//**
+Like the counterpart for a clustered index below, but now we read a
+secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: secondary index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks that a record is seen in a consistent read.
+@return TRUE if sees, or FALSE if an earlier version of the record
+should be retrieved */
+UNIV_INTERN
+ibool
+lock_clust_rec_cons_read_sees(
+/*==========================*/
+ const rec_t* rec, /*!< in: user record which should be read or
+ passed over by a read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ read_view_t* view); /*!< in: consistent read view */
+/*********************************************************************//**
+Checks that a non-clustered index record is seen in a consistent read.
+
+NOTE that a non-clustered index page contains so little information on
+its modifications that also in the case FALSE, the present version of
+rec may be the right, but we must check this from the clustered index
+record.
+
+@return TRUE if certainly sees, or FALSE if an earlier version of the
+clustered index record might be needed */
+UNIV_INTERN
+ulint
+lock_sec_rec_cons_read_sees(
+/*========================*/
+ const rec_t* rec, /*!< in: user record which
+ should be read or passed over
+ by a read cursor */
+ const read_view_t* view); /*!< in: consistent read view */
+/*********************************************************************//**
+Locks the specified database table in the mode given. If the lock cannot
+be granted immediately, the query thread is put to wait.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_table(
+/*=======*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ dict_table_t* table, /*!< in: database table in dictionary cache */
+ enum lock_mode mode, /*!< in: lock mode */
+ que_thr_t* thr); /*!< in: query thread */
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock. */
+UNIV_INTERN
+void
+lock_rec_unlock(
+/*============*/
+ trx_t* trx, /*!< in: transaction that has
+ set a record lock */
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record */
+ enum lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */
+/*********************************************************************//**
+Releases transaction locks, and releases possible other transactions waiting
+because of these locks. */
+UNIV_INTERN
+void
+lock_release_off_kernel(
+/*====================*/
+ trx_t* trx); /*!< in: transaction */
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+ lock_t* lock); /*!< in: waiting lock request */
+
+/*********************************************************************//**
+Removes locks on a table to be dropped or truncated.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock, that is going to be removed, is allowed to be a wait lock. */
+UNIV_INTERN
+void
+lock_remove_all_on_table(
+/*=====================*/
+ dict_table_t* table, /*!< in: table to be dropped
+ or truncated */
+ ibool remove_also_table_sx_locks);/*!< in: also removes
+ table S and X locks */
+
+/*********************************************************************//**
+Calculates the fold value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return folded value */
+UNIV_INLINE
+ulint
+lock_rec_fold(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+ __attribute__((const));
+/*********************************************************************//**
+Calculates the hash value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return hashed value */
+UNIV_INLINE
+ulint
+lock_rec_hash(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no);/*!< in: page number */
+
+/**********************************************************************//**
+Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED,
+if none found.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+none found */
+UNIV_INTERN
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+ const lock_t* lock); /*!< in: record lock with at least one
+ bit set */
+
+/*********************************************************************//**
+Gets the source table of an ALTER TABLE transaction. The table must be
+covered by an IX or IS table lock.
+@return the source table of transaction, if it is covered by an IX or
+IS table lock; dest if there is no source table, and NULL if the
+transaction is locking more than two tables or an inconsistency is
+found */
+UNIV_INTERN
+dict_table_t*
+lock_get_src_table(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* dest, /*!< in: destination of ALTER TABLE */
+ enum lock_mode* mode); /*!< out: lock mode of the source table */
+/*********************************************************************//**
+Determine if the given table is exclusively "owned" by the given
+transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC
+on the table.
+@return TRUE if table is only locked by trx, with LOCK_IX, and
+possibly LOCK_AUTO_INC */
+UNIV_INTERN
+ibool
+lock_is_table_exclusive(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ trx_t* trx); /*!< in: transaction */
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return TRUE if lock1 has to wait for lock2 to be removed */
+UNIV_INTERN
+ibool
+lock_has_to_wait(
+/*=============*/
+ const lock_t* lock1, /*!< in: waiting lock */
+ const lock_t* lock2); /*!< in: another lock; NOTE that it is
+ assumed that this has a lock bit set
+ on the same record as in lock1 if the
+ locks are record locks */
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+lock_check_trx_id_sanity(
+/*=====================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ ibool has_kernel_mutex);/*!< in: TRUE if the caller owns the
+ kernel mutex */
+/*********************************************************************//**
+Prints info of a table lock. */
+UNIV_INTERN
+void
+lock_table_print(
+/*=============*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock); /*!< in: table type lock */
+/*********************************************************************//**
+Prints info of a record lock. */
+UNIV_INTERN
+void
+lock_rec_print(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock); /*!< in: record type lock */
+/*********************************************************************//**
+Prints info of locks for all transactions. */
+UNIV_INTERN
+void
+lock_print_info_summary(
+/*====================*/
+ FILE* file); /*!< in: file where to print */
+/*********************************************************************//**
+Prints info of locks for each transaction. */
+UNIV_INTERN
+void
+lock_print_info_all_transactions(
+/*=============================*/
+ FILE* file); /*!< in: file where to print */
+/*********************************************************************//**
+Return approximate number or record locks (bits set in the bitmap) for
+this transaction. Since delete-marked records may be removed, the
+record count will not be precise. */
+UNIV_INTERN
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+ trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Release all the transaction's autoinc locks. */
+UNIV_INTERN
+void
+lock_release_autoinc_locks(
+/*=======================*/
+ trx_t* trx); /*!< in/out: transaction */
+
+/*******************************************************************//**
+Gets the type of a lock. Non-inline version for using outside of the
+lock module.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INTERN
+ulint
+lock_get_type(
+/*==========*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the id of the transaction owning a lock.
+@return transaction id */
+UNIV_INTERN
+ullint
+lock_get_trx_id(
+/*============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the mode of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return lock mode */
+UNIV_INTERN
+const char*
+lock_get_mode_str(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the type of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return lock type */
+UNIV_INTERN
+const char*
+lock_get_type_str(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the id of the table on which the lock is.
+@return id of the table */
+UNIV_INTERN
+ullint
+lock_get_table_id(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the name of the table on which the lock is.
+The string should not be free()'d or modified.
+@return name of the table */
+UNIV_INTERN
+const char*
+lock_get_table_name(
+/*================*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the index on which the lock is.
+@return index */
+UNIV_INTERN
+const dict_index_t*
+lock_rec_get_index(
+/*===============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the name of the index on which the lock is.
+The string should not be free()'d or modified.
+@return name of the index */
+UNIV_INTERN
+const char*
+lock_rec_get_index_name(
+/*====================*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the tablespace number on which the lock is.
+@return tablespace number */
+UNIV_INTERN
+ulint
+lock_rec_get_space_id(
+/*==================*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the page number on which the lock is.
+@return page number */
+UNIV_INTERN
+ulint
+lock_rec_get_page_no(
+/*=================*/
+ const lock_t* lock); /*!< in: lock */
+
+/** Lock modes and types */
+/* @{ */
+#define LOCK_MODE_MASK 0xFUL /*!< mask used to extract mode from the
+ type_mode field in a lock */
+/** Lock types */
+/* @{ */
+#define LOCK_TABLE 16 /*!< table lock */
+#define LOCK_REC 32 /*!< record lock */
+#define LOCK_TYPE_MASK 0xF0UL /*!< mask used to extract lock type from the
+ type_mode field in a lock */
+#if LOCK_MODE_MASK & LOCK_TYPE_MASK
+# error "LOCK_MODE_MASK & LOCK_TYPE_MASK"
+#endif
+
+#define LOCK_WAIT 256 /*!< Waiting lock flag; when set, it
+ means that the lock has not yet been
+ granted, it is just waiting for its
+ turn in the wait queue */
+/* Precise modes */
+#define LOCK_ORDINARY 0 /*!< this flag denotes an ordinary
+ next-key lock in contrast to LOCK_GAP
+ or LOCK_REC_NOT_GAP */
+#define LOCK_GAP 512 /*!< when this bit is set, it means that the
+ lock holds only on the gap before the record;
+ for instance, an x-lock on the gap does not
+ give permission to modify the record on which
+ the bit is set; locks of this type are created
+ when records are removed from the index chain
+ of records */
+#define LOCK_REC_NOT_GAP 1024 /*!< this bit means that the lock is only on
+ the index record and does NOT block inserts
+ to the gap before the index record; this is
+ used in the case when we retrieve a record
+ with a unique key, and is also used in
+ locking plain SELECTs (not part of UPDATE
+ or DELETE) when the user has set the READ
+ COMMITTED isolation level */
+#define LOCK_INSERT_INTENTION 2048 /*!< this bit is set when we place a waiting
+ gap type record lock request in order to let
+ an insert of an index record to wait until
+ there are no conflicting locks by other
+ transactions on the gap; note that this flag
+ remains set when the waiting lock is granted,
+ or if the lock is inherited to a neighboring
+ record */
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_MODE_MASK
+# error
+#endif
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_TYPE_MASK
+# error
+#endif
+/* @} */
+
+/** Lock operation struct */
+typedef struct lock_op_struct lock_op_t;
+/** Lock operation struct */
+struct lock_op_struct{
+ dict_table_t* table; /*!< table to be locked */
+ enum lock_mode mode; /*!< lock mode */
+};
+
+/** The lock system struct */
+struct lock_sys_struct{
+ hash_table_t* rec_hash; /*!< hash table of the record locks */
+};
+
+/** The lock system */
+extern lock_sys_t* lock_sys;
+
+
+#ifndef UNIV_NONINL
+#include "lock0lock.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic
new file mode 100644
index 00000000000..014722f51c4
--- /dev/null
+++ b/storage/innobase/include/lock0lock.ic
@@ -0,0 +1,121 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.ic
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#include "srv0srv.h"
+#include "dict0dict.h"
+#include "row0row.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "buf0buf.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "row0vers.h"
+#include "que0que.h"
+#include "btr0cur.h"
+#include "read0read.h"
+#include "log0recv.h"
+
+/*********************************************************************//**
+Calculates the fold value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return folded value */
+UNIV_INLINE
+ulint
+lock_rec_fold(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ return(ut_fold_ulint_pair(space, page_no));
+}
+
+/*********************************************************************//**
+Calculates the hash value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return hashed value */
+UNIV_INLINE
+ulint
+lock_rec_hash(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ return(hash_calc_hash(lock_rec_fold(space, page_no),
+ lock_sys->rec_hash));
+}
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction which has the x-lock, or NULL */
+UNIV_INLINE
+trx_t*
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ trx_id_t trx_id;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+
+ trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ if (trx_is_active(trx_id)) {
+ /* The modifying or inserting transaction is active */
+
+ return(trx_get_on_id(trx_id));
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ const page_t* page = block->frame;
+
+ if (page_is_comp(page)) {
+ return(rec_get_heap_no_new(
+ page
+ + rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE)));
+ } else {
+ return(rec_get_heap_no_old(
+ page
+ + rec_get_next_offs(page + PAGE_OLD_INFIMUM,
+ FALSE)));
+ }
+}
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
new file mode 100644
index 00000000000..287c151b19f
--- /dev/null
+++ b/storage/innobase/include/lock0priv.h
@@ -0,0 +1,108 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.h
+Lock module internal structures and methods.
+
+Created July 12, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0priv_h
+#define lock0priv_h
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+/* If you need to access members of the structures defined in this
+file, please write appropriate functions that retrieve them and put
+those functions in lock/ */
+#error Do not include lock0priv.h outside of the lock/ module
+#endif
+
+#include "univ.i"
+#include "dict0types.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "ut0lst.h"
+
+/** A table lock */
+typedef struct lock_table_struct lock_table_t;
+/** A table lock */
+struct lock_table_struct {
+ dict_table_t* table; /*!< database table in dictionary
+ cache */
+ UT_LIST_NODE_T(lock_t)
+ locks; /*!< list of locks on the same
+ table */
+};
+
+/** Record lock for a page */
+typedef struct lock_rec_struct lock_rec_t;
+/** Record lock for a page */
+struct lock_rec_struct {
+ ulint space; /*!< space id */
+ ulint page_no; /*!< page number */
+ ulint n_bits; /*!< number of bits in the lock
+ bitmap; NOTE: the lock bitmap is
+ placed immediately after the
+ lock struct */
+};
+
+/** Lock struct */
+struct lock_struct {
+ trx_t* trx; /*!< transaction owning the
+ lock */
+ UT_LIST_NODE_T(lock_t)
+ trx_locks; /*!< list of the locks of the
+ transaction */
+ ulint type_mode; /*!< lock type, mode, LOCK_GAP or
+ LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION,
+ wait flag, ORed */
+ hash_node_t hash; /*!< hash chain node for a record
+ lock */
+ dict_index_t* index; /*!< index for a record lock */
+ union {
+ lock_table_t tab_lock;/*!< table lock */
+ lock_rec_t rec_lock;/*!< record lock */
+ } un_member; /*!< lock details */
+};
+
+/*********************************************************************//**
+Gets the type of a lock.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INLINE
+ulint
+lock_get_type_low(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+UNIV_INTERN
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+ const lock_t* in_lock,/*!< in: record lock */
+ ulint heap_no);/*!< in: heap number of the record */
+
+#ifndef UNIV_NONINL
+#include "lock0priv.ic"
+#endif
+
+#endif /* lock0priv_h */
diff --git a/storage/innobase/include/lock0priv.ic b/storage/innobase/include/lock0priv.ic
new file mode 100644
index 00000000000..30447c99848
--- /dev/null
+++ b/storage/innobase/include/lock0priv.ic
@@ -0,0 +1,49 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.ic
+Lock module internal inline methods.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+/* This file contains only methods which are used in
+lock/lock0* files, other than lock/lock0lock.c.
+I.e. lock/lock0lock.c contains more internal inline
+methods but they are used only in that file. */
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+#error Do not include lock0priv.ic outside of the lock/ module
+#endif
+
+/*********************************************************************//**
+Gets the type of a lock.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INLINE
+ulint
+lock_get_type_low(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock);
+
+ return(lock->type_mode & LOCK_TYPE_MASK);
+}
+
+/* vim: set filetype=c: */
diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h
new file mode 100644
index 00000000000..45f29e90fe9
--- /dev/null
+++ b/storage/innobase/include/lock0types.h
@@ -0,0 +1,45 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0types.h
+The transaction lock system global types
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0types_h
+#define lock0types_h
+
+#define lock_t ib_lock_t
+typedef struct lock_struct lock_t;
+typedef struct lock_sys_struct lock_sys_t;
+
+/* Basic lock modes */
+enum lock_mode {
+ LOCK_IS = 0, /* intention shared */
+ LOCK_IX, /* intention exclusive */
+ LOCK_S, /* shared */
+ LOCK_X, /* exclusive */
+ LOCK_AUTO_INC, /* locks the auto-inc counter of a table
+ in an exclusive mode */
+ LOCK_NONE, /* this is used elsewhere to note consistent read */
+ LOCK_NUM = LOCK_NONE/* number of lock modes */
+};
+
+#endif
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
new file mode 100644
index 00000000000..059f548a085
--- /dev/null
+++ b/storage/innobase/include/log0log.h
@@ -0,0 +1,958 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.h
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0log_h
+#define log0log_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "ut0lst.h"
+#ifndef UNIV_HOTBACKUP
+#include "sync0sync.h"
+#include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/** Redo log buffer */
+typedef struct log_struct log_t;
+/** Redo log group */
+typedef struct log_group_struct log_group_t;
+
+#ifdef UNIV_DEBUG
+/** Flag: write to log file? */
+extern ibool log_do_write;
+/** Flag: enable debug output when writing to the log? */
+extern ibool log_debug_writes;
+#else /* UNIV_DEBUG */
+/** Write to log */
+# define log_do_write TRUE
+#endif /* UNIV_DEBUG */
+
+/** Wait modes for log_write_up_to @{ */
+#define LOG_NO_WAIT 91
+#define LOG_WAIT_ONE_GROUP 92
+#define LOG_WAIT_ALL_GROUPS 93
+/* @} */
+/** Maximum number of log groups in log_group_struct::checkpoint_buf */
+#define LOG_MAX_N_GROUPS 32
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint,
+so that we know that the limit has been written to a log checkpoint field
+on disk. */
+UNIV_INTERN
+void
+log_fsp_current_free_limit_set_and_checkpoint(
+/*==========================================*/
+ ulint limit); /*!< in: limit to set */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Calculates where in log files we find a specified lsn.
+@return log file number */
+UNIV_INTERN
+ulint
+log_calc_where_lsn_is(
+/*==================*/
+ ib_int64_t* log_file_offset, /*!< out: offset in that file
+ (including the header) */
+ ib_uint64_t first_header_lsn, /*!< in: first log file start
+ lsn */
+ ib_uint64_t lsn, /*!< in: lsn whose position to
+ determine */
+ ulint n_log_files, /*!< in: total number of log
+ files */
+ ib_int64_t log_file_size); /*!< in: log file size
+ (including the header) */
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Writes to the log the string given. The log must be released with
+log_release.
+@return end lsn of the log record, zero if did not succeed */
+UNIV_INLINE
+ib_uint64_t
+log_reserve_and_write_fast(
+/*=======================*/
+ byte* str, /*!< in: string */
+ ulint len, /*!< in: string length */
+ ib_uint64_t* start_lsn,/*!< out: start lsn of the log record */
+ ibool* success);/*!< out: TRUE if success */
+/***********************************************************************//**
+Releases the log mutex. */
+UNIV_INLINE
+void
+log_release(void);
+/*=============*/
+/***********************************************************************//**
+Checks if there is need for a log buffer flush or a new checkpoint, and does
+this if yes. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void);
+/*================*/
+/************************************************************//**
+Opens the log for log_write_low. The log must be closed with log_close and
+released with log_release.
+@return start lsn of the log record */
+UNIV_INTERN
+ib_uint64_t
+log_reserve_and_open(
+/*=================*/
+ ulint len); /*!< in: length of data to be catenated */
+/************************************************************//**
+Writes to the log the string given. It is assumed that the caller holds the
+log mutex. */
+UNIV_INTERN
+void
+log_write_low(
+/*==========*/
+ byte* str, /*!< in: string */
+ ulint str_len); /*!< in: string length */
+/************************************************************//**
+Closes the log.
+@return lsn */
+UNIV_INTERN
+ib_uint64_t
+log_close(void);
+/*===========*/
+/************************************************************//**
+Gets the current lsn.
+@return current lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_lsn(void);
+/*=============*/
+/****************************************************************
+Gets the log group capacity. It is OK to read the value without
+holding log_sys->mutex because it is constant.
+@return log group capacity */
+UNIV_INLINE
+ulint
+log_get_capacity(void);
+/*==================*/
+/******************************************************//**
+Initializes the log. */
+UNIV_INTERN
+void
+log_init(void);
+/*==========*/
+/******************************************************************//**
+Inits a log group to the log system. */
+UNIV_INTERN
+void
+log_group_init(
+/*===========*/
+ ulint id, /*!< in: group id */
+ ulint n_files, /*!< in: number of log files */
+ ulint file_size, /*!< in: log file size in bytes */
+ ulint space_id, /*!< in: space id of the file space
+ which contains the log files of this
+ group */
+ ulint archive_space_id); /*!< in: space id of the file space
+ which contains some archived log
+ files for this group; currently, only
+ for the first log group this is
+ used */
+/******************************************************//**
+Completes an i/o to a log file. */
+UNIV_INTERN
+void
+log_io_complete(
+/*============*/
+ log_group_t* group); /*!< in: log group */
+/******************************************************//**
+This function is called, e.g., when a transaction wants to commit. It checks
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits and checks if the
+flush flushed enough. If not, starts a new flush. */
+UNIV_INTERN
+void
+log_write_up_to(
+/*============*/
+ ib_uint64_t lsn, /*!< in: log sequence number up to which
+ the log should be written,
+ IB_ULONGLONG_MAX if not specified */
+ ulint wait, /*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ or LOG_WAIT_ALL_GROUPS */
+ ibool flush_to_disk);
+ /*!< in: TRUE if we want the written log
+ also to be flushed to disk */
+/****************************************************************//**
+Does a syncronous flush of the log buffer to disk. */
+UNIV_INTERN
+void
+log_buffer_flush_to_disk(void);
+/*==========================*/
+/****************************************************************//**
+This functions writes the log buffer to the log file and if 'flush'
+is set it forces a flush of the log file as well. This is meant to be
+called from background master thread only as it does not wait for
+the write (+ possible flush) to finish. */
+UNIV_INTERN
+void
+log_buffer_sync_in_background(
+/*==========================*/
+ ibool flush); /*<! in: flush the logs to disk */
+/****************************************************************//**
+Advances the smallest lsn for which there are unflushed dirty blocks in the
+buffer pool and also may make a new checkpoint. NOTE: this function may only
+be called if the calling thread owns no synchronization objects!
+@return FALSE if there was a flush batch of the same type running,
+which means that we could not start this flush batch */
+UNIV_INTERN
+ibool
+log_preflush_pool_modified_pages(
+/*=============================*/
+ ib_uint64_t new_oldest, /*!< in: try to advance
+ oldest_modified_lsn at least
+ to this lsn */
+ ibool sync); /*!< in: TRUE if synchronous
+ operation is desired */
+/******************************************************//**
+Makes a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks what is lsn of the oldest
+modification in the pool, and writes information about the lsn in
+log files. Use log_make_checkpoint_at to flush also the pool.
+@return TRUE if success, FALSE if a checkpoint write was already running */
+UNIV_INTERN
+ibool
+log_checkpoint(
+/*===========*/
+ ibool sync, /*!< in: TRUE if synchronous operation is
+ desired */
+ ibool write_always); /*!< in: the function normally checks if the
+ the new checkpoint would have a greater
+ lsn than the previous one: if not, then no
+ physical write is done; by setting this
+ parameter TRUE, a physical write will always be
+ made to log files */
+/****************************************************************//**
+Makes a checkpoint at a given lsn or later. */
+UNIV_INTERN
+void
+log_make_checkpoint_at(
+/*===================*/
+ ib_uint64_t lsn, /*!< in: make a checkpoint at this or a
+ later lsn, if IB_ULONGLONG_MAX, makes
+ a checkpoint at the latest lsn */
+ ibool write_always); /*!< in: the function normally checks if
+ the the new checkpoint would have a
+ greater lsn than the previous one: if
+ not, then no physical write is done;
+ by setting this parameter TRUE, a
+ physical write will always be made to
+ log files */
+/****************************************************************//**
+Makes a checkpoint at the latest lsn and writes it to first page of each
+data file in the database, so that we know that the file spaces contain
+all modifications up to that lsn. This can only be called at database
+shutdown. This function also writes all log in log files to the log archive. */
+UNIV_INTERN
+void
+logs_empty_and_mark_files_at_shutdown(void);
+/*=======================================*/
+/******************************************************//**
+Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. */
+UNIV_INTERN
+void
+log_group_read_checkpoint_info(
+/*===========================*/
+ log_group_t* group, /*!< in: log group */
+ ulint field); /*!< in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
+/*******************************************************************//**
+Gets info from a checkpoint about a log group. */
+UNIV_INTERN
+void
+log_checkpoint_get_nth_group_info(
+/*==============================*/
+ const byte* buf, /*!< in: buffer containing checkpoint info */
+ ulint n, /*!< in: nth slot */
+ ulint* file_no,/*!< out: archived file number */
+ ulint* offset);/*!< out: archived file offset */
+/******************************************************//**
+Writes checkpoint info to groups. */
+UNIV_INTERN
+void
+log_groups_write_checkpoint_info(void);
+/*==================================*/
+/********************************************************************//**
+Starts an archiving operation.
+@return TRUE if succeed, FALSE if an archiving operation was already running */
+UNIV_INTERN
+ibool
+log_archive_do(
+/*===========*/
+ ibool sync, /*!< in: TRUE if synchronous operation is desired */
+ ulint* n_bytes);/*!< out: archive log buffer size, 0 if nothing to
+ archive */
+/****************************************************************//**
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from a number one higher, so that the archiving will
+not write again to the archived log files which exist when this function
+returns.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_stop(void);
+/*==================*/
+/****************************************************************//**
+Starts again archiving which has been stopped.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_start(void);
+/*===================*/
+/****************************************************************//**
+Stop archiving the log so that a gap may occur in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_noarchivelog(void);
+/*==========================*/
+/****************************************************************//**
+Start archiving the log so that a gap may occur in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_archivelog(void);
+/*========================*/
+/******************************************************//**
+Generates an archived log file name. */
+UNIV_INTERN
+void
+log_archived_file_name_gen(
+/*=======================*/
+ char* buf, /*!< in: buffer where to write */
+ ulint id, /*!< in: group id */
+ ulint file_no);/*!< in: file number */
+#else /* !UNIV_HOTBACKUP */
+/******************************************************//**
+Writes info to a buffer of a log group when log files are created in
+backup restoration. */
+UNIV_INTERN
+void
+log_reset_first_header_and_checkpoint(
+/*==================================*/
+ byte* hdr_buf,/*!< in: buffer which will be written to the
+ start of the first log file */
+ ib_uint64_t start); /*!< in: lsn of the start of the first log file;
+ we pretend that there is a checkpoint at
+ start + LOG_BLOCK_HDR_SIZE */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+UNIV_INTERN
+void
+log_check_margins(void);
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+/******************************************************//**
+Reads a specified log segment to a buffer. */
+UNIV_INTERN
+void
+log_group_read_log_seg(
+/*===================*/
+ ulint type, /*!< in: LOG_ARCHIVE or LOG_RECOVER */
+ byte* buf, /*!< in: buffer where to read */
+ log_group_t* group, /*!< in: log group */
+ ib_uint64_t start_lsn, /*!< in: read area start */
+ ib_uint64_t end_lsn); /*!< in: read area end */
+/******************************************************//**
+Writes a buffer to a log file group. */
+UNIV_INTERN
+void
+log_group_write_buf(
+/*================*/
+ log_group_t* group, /*!< in: log group */
+ byte* buf, /*!< in: buffer */
+ ulint len, /*!< in: buffer len; must be divisible
+ by OS_FILE_LOG_BLOCK_SIZE */
+ ib_uint64_t start_lsn, /*!< in: start lsn of the buffer; must
+ be divisible by
+ OS_FILE_LOG_BLOCK_SIZE */
+ ulint new_data_offset);/*!< in: start offset of new data in
+ buf: this parameter is used to decide
+ if we have to write a new log file
+ header */
+/********************************************************//**
+Sets the field values in group to correspond to a given lsn. For this function
+to work, the values must already be correctly initialized to correspond to
+some lsn, for instance, a checkpoint lsn. */
+UNIV_INTERN
+void
+log_group_set_fields(
+/*=================*/
+ log_group_t* group, /*!< in/out: group */
+ ib_uint64_t lsn); /*!< in: lsn for which the values should be
+ set */
+/******************************************************//**
+Calculates the data capacity of a log group, when the log file headers are not
+included.
+@return capacity in bytes */
+UNIV_INTERN
+ulint
+log_group_get_capacity(
+/*===================*/
+ const log_group_t* group); /*!< in: log group */
+#endif /* !UNIV_HOTBACKUP */
+/************************************************************//**
+Gets a log block flush bit.
+@return TRUE if this block was the first to be written in a log flush */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Gets a log block number stored in the header.
+@return log block number stored in the block header */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Gets a log block data length.
+@return log block data length measured as a byte offset from the block start */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint len); /*!< in: data length */
+/************************************************************//**
+Calculates the checksum for a log block.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_calc_checksum(
+/*====================*/
+ const byte* block); /*!< in: log block */
+/************************************************************//**
+Gets a log block checksum field value.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_get_checksum(
+/*===================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets a log block checksum field value. */
+UNIV_INLINE
+void
+log_block_set_checksum(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint checksum); /*!< in: checksum */
+/************************************************************//**
+Gets a log block first mtr log record group offset.
+@return first mtr log record group byte offset from the block start, 0
+if none */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint offset); /*!< in: offset, 0 if none */
+/************************************************************//**
+Gets a log block checkpoint number field (4 lowest bytes).
+@return checkpoint no (4 lowest bytes) */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ ib_uint64_t lsn); /*!< in: lsn within the log block */
+/************************************************************//**
+Initializes a log block in the log buffer in the old, < 3.23.52 format, where
+there was no checksum yet. */
+UNIV_INLINE
+void
+log_block_init_in_old_format(
+/*=========================*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ ib_uint64_t lsn); /*!< in: lsn within the log block */
+/************************************************************//**
+Converts a lsn to a log block number.
+@return log block number, it is > 0 and <= 1G */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+ ib_uint64_t lsn); /*!< in: lsn of a byte within the block */
+/******************************************************//**
+Prints info of the log. */
+UNIV_INTERN
+void
+log_print(
+/*======*/
+ FILE* file); /*!< in: file where to print */
+/******************************************************//**
+Peeks the current lsn.
+@return TRUE if success, FALSE if could not get the log system mutex */
+UNIV_INTERN
+ibool
+log_peek_lsn(
+/*=========*/
+ ib_uint64_t* lsn); /*!< out: if returns TRUE, current lsn is here */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+log_refresh_stats(void);
+/*===================*/
+
+extern log_t* log_sys;
+
+/* Values used as flags */
+#define LOG_FLUSH 7652559
+#define LOG_CHECKPOINT 78656949
+#ifdef UNIV_LOG_ARCHIVE
+# define LOG_ARCHIVE 11122331
+#endif /* UNIV_LOG_ARCHIVE */
+#define LOG_RECOVER 98887331
+
+/* The counting of lsn's starts from this value: this must be non-zero */
+#define LOG_START_LSN ((ib_uint64_t) (16 * OS_FILE_LOG_BLOCK_SIZE))
+
+#define LOG_BUFFER_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE)
+#define LOG_ARCHIVE_BUF_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE / 4)
+
+/* Offsets of a log block header */
+#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and
+ is allowed to wrap around at 2G; the
+ highest bit is set to 1 if this is the
+ first log block in a log flush write
+ segment */
+#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL
+ /* mask used to get the highest bit in
+ the preceding field */
+#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to
+ this block */
+#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an
+ mtr log record group in this log block,
+ 0 if none; if the value is the same
+ as LOG_BLOCK_HDR_DATA_LEN, it means
+ that the first rec group has not yet
+ been catenated to this log block, but
+ if it will, it will start at this
+ offset; an archive recovery can
+ start parsing the log records starting
+ from this offset in this log block,
+ if value not 0 */
+#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of
+ log_sys->next_checkpoint_no when the
+ log block was last written to: if the
+ block has not yet been written full,
+ this value is only updated before a
+ log buffer flush */
+#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in
+ bytes */
+
+/* Offsets of a log block trailer from the end of the block */
+#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block
+ contents; in InnoDB versions
+ < 3.23.52 this did not contain the
+ checksum but the same value as
+ .._HDR_NO */
+#define LOG_BLOCK_TRL_SIZE 4 /* trailer size in bytes */
+
+/* Offsets for a checkpoint field */
+#define LOG_CHECKPOINT_NO 0
+#define LOG_CHECKPOINT_LSN 8
+#define LOG_CHECKPOINT_OFFSET 16
+#define LOG_CHECKPOINT_LOG_BUF_SIZE 20
+#define LOG_CHECKPOINT_ARCHIVED_LSN 24
+#define LOG_CHECKPOINT_GROUP_ARRAY 32
+
+/* For each value smaller than LOG_MAX_N_GROUPS the following 8 bytes: */
+
+#define LOG_CHECKPOINT_ARCHIVED_FILE_NO 0
+#define LOG_CHECKPOINT_ARCHIVED_OFFSET 4
+
+#define LOG_CHECKPOINT_ARRAY_END (LOG_CHECKPOINT_GROUP_ARRAY\
+ + LOG_MAX_N_GROUPS * 8)
+#define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END
+#define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END)
+#define LOG_CHECKPOINT_FSP_FREE_LIMIT (8 + LOG_CHECKPOINT_ARRAY_END)
+ /* current fsp free limit in
+ tablespace 0, in units of one
+ megabyte; this information is only used
+ by ibbackup to decide if it can
+ truncate unused ends of
+ non-auto-extending data files in space
+ 0 */
+#define LOG_CHECKPOINT_FSP_MAGIC_N (12 + LOG_CHECKPOINT_ARRAY_END)
+ /* this magic number tells if the
+ checkpoint contains the above field:
+ the field was added to
+ InnoDB-3.23.50 */
+#define LOG_CHECKPOINT_SIZE (16 + LOG_CHECKPOINT_ARRAY_END)
+
+#define LOG_CHECKPOINT_FSP_MAGIC_N_VAL 1441231243
+
+/* Offsets of a log file header */
+#define LOG_GROUP_ID 0 /* log group number */
+#define LOG_FILE_START_LSN 4 /* lsn of the start of data in this
+ log file */
+#define LOG_FILE_NO 12 /* 4-byte archived log file number;
+ this field is only defined in an
+ archived log file */
+#define LOG_FILE_WAS_CREATED_BY_HOT_BACKUP 16
+ /* a 32-byte field which contains
+ the string 'ibbackup' and the
+ creation time if the log file was
+ created by ibbackup --restore;
+ when mysqld is first time started
+ on the restored database, it can
+ print helpful info for the user */
+#define LOG_FILE_ARCH_COMPLETED OS_FILE_LOG_BLOCK_SIZE
+ /* this 4-byte field is TRUE when
+ the writing of an archived log file
+ has been completed; this field is
+ only defined in an archived log file */
+#define LOG_FILE_END_LSN (OS_FILE_LOG_BLOCK_SIZE + 4)
+ /* lsn where the archived log file
+ at least extends: actually the
+ archived log file may extend to a
+ later lsn, as long as it is within the
+ same log block as this lsn; this field
+ is defined only when an archived log
+ file has been completely written */
+#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE
+ /* first checkpoint field in the log
+ header; we write alternately to the
+ checkpoint fields when we make new
+ checkpoints; this field is only defined
+ in the first log file of a log group */
+#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE)
+ /* second checkpoint field in the log
+ header */
+#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE)
+
+#define LOG_GROUP_OK 301
+#define LOG_GROUP_CORRUPTED 302
+
+/** Log group consists of a number of log files, each of the same size; a log
+group is implemented as a space in the sense of the module fil0fil. */
+struct log_group_struct{
+ /* The following fields are protected by log_sys->mutex */
+ ulint id; /*!< log group id */
+ ulint n_files; /*!< number of files in the group */
+ ulint file_size; /*!< individual log file size in bytes,
+ including the log file header */
+ ulint space_id; /*!< file space which implements the log
+ group */
+ ulint state; /*!< LOG_GROUP_OK or
+ LOG_GROUP_CORRUPTED */
+ ib_uint64_t lsn; /*!< lsn used to fix coordinates within
+ the log group */
+ ulint lsn_offset; /*!< the offset of the above lsn */
+ ulint n_pending_writes;/*!< number of currently pending flush
+ writes for this log group */
+ byte** file_header_bufs;/*!< buffers for each file
+ header in the group */
+ /*-----------------------------*/
+ byte** archive_file_header_bufs;/*!< buffers for each file
+ header in the group */
+ ulint archive_space_id;/*!< file space which
+ implements the log group
+ archive */
+ ulint archived_file_no;/*!< file number corresponding to
+ log_sys->archived_lsn */
+ ulint archived_offset;/*!< file offset corresponding to
+ log_sys->archived_lsn, 0 if we have
+ not yet written to the archive file
+ number archived_file_no */
+ ulint next_archived_file_no;/*!< during an archive write,
+ until the write is completed, we
+ store the next value for
+ archived_file_no here: the write
+ completion function then sets the new
+ value to ..._file_no */
+ ulint next_archived_offset; /*!< like the preceding field */
+ /*-----------------------------*/
+ ib_uint64_t scanned_lsn; /*!< used only in recovery: recovery scan
+ succeeded up to this lsn in this log
+ group */
+ byte* checkpoint_buf; /*!< checkpoint header is written from
+ this buffer to the group */
+ UT_LIST_NODE_T(log_group_t)
+ log_groups; /*!< list of log groups */
+};
+
+/** Redo log buffer */
+struct log_struct{
+ byte pad[64]; /*!< padding to prevent other memory
+ update hotspots from residing on the
+ same memory cache line */
+ ib_uint64_t lsn; /*!< log sequence number */
+ ulint buf_free; /*!< first free offset within the log
+ buffer */
+#ifndef UNIV_HOTBACKUP
+ mutex_t mutex; /*!< mutex protecting the log */
+#endif /* !UNIV_HOTBACKUP */
+ byte* buf; /*!< log buffer */
+ ulint buf_size; /*!< log buffer size in bytes */
+ ulint max_buf_free; /*!< recommended maximum value of
+ buf_free, after which the buffer is
+ flushed */
+ ulint old_buf_free; /*!< value of buf free when log was
+ last time opened; only in the debug
+ version */
+ ib_uint64_t old_lsn; /*!< value of lsn when log was
+ last time opened; only in the
+ debug version */
+ ibool check_flush_or_checkpoint;
+ /*!< this is set to TRUE when there may
+ be need to flush the log buffer, or
+ preflush buffer pool pages, or make
+ a checkpoint; this MUST be TRUE when
+ lsn - last_checkpoint_lsn >
+ max_checkpoint_age; this flag is
+ peeked at by log_free_check(), which
+ does not reserve the log mutex */
+ UT_LIST_BASE_NODE_T(log_group_t)
+ log_groups; /*!< log groups */
+
+#ifndef UNIV_HOTBACKUP
+ /** The fields involved in the log buffer flush @{ */
+
+ ulint buf_next_to_write;/*!< first offset in the log buffer
+ where the byte content may not exist
+ written to file, e.g., the start
+ offset of a log record catenated
+ later; this is advanced when a flush
+ operation is completed to all the log
+ groups */
+ ib_uint64_t written_to_some_lsn;
+ /*!< first log sequence number not yet
+ written to any log group; for this to
+ be advanced, it is enough that the
+ write i/o has been completed for any
+ one log group */
+ ib_uint64_t written_to_all_lsn;
+ /*!< first log sequence number not yet
+ written to some log group; for this to
+ be advanced, it is enough that the
+ write i/o has been completed for all
+ log groups */
+ ib_uint64_t write_lsn; /*!< end lsn for the current running
+ write */
+ ulint write_end_offset;/*!< the data in buffer has
+ been written up to this offset
+ when the current write ends:
+ this field will then be copied
+ to buf_next_to_write */
+ ib_uint64_t current_flush_lsn;/*!< end lsn for the current running
+ write + flush operation */
+ ib_uint64_t flushed_to_disk_lsn;
+ /*!< how far we have written the log
+ AND flushed to disk */
+ ulint n_pending_writes;/*!< number of currently
+ pending flushes or writes */
+ /* NOTE on the 'flush' in names of the fields below: starting from
+ 4.0.14, we separate the write of the log file and the actual fsync()
+ or other method to flush it to disk. The names below shhould really
+ be 'flush_or_write'! */
+ os_event_t no_flush_event; /*!< this event is in the reset state
+ when a flush or a write is running;
+ a thread should wait for this without
+ owning the log mutex, but NOTE that
+ to set or reset this event, the
+ thread MUST own the log mutex! */
+ ibool one_flushed; /*!< during a flush, this is
+ first FALSE and becomes TRUE
+ when one log group has been
+ written or flushed */
+ os_event_t one_flushed_event;/*!< this event is reset when the
+ flush or write has not yet completed
+ for any log group; e.g., this means
+ that a transaction has been committed
+ when this is set; a thread should wait
+ for this without owning the log mutex,
+ but NOTE that to set or reset this
+ event, the thread MUST own the log
+ mutex! */
+ ulint n_log_ios; /*!< number of log i/os initiated thus
+ far */
+ ulint n_log_ios_old; /*!< number of log i/o's at the
+ previous printout */
+ time_t last_printout_time;/*!< when log_print was last time
+ called */
+ /* @} */
+
+ /** Fields involved in checkpoints @{ */
+ ulint log_group_capacity; /*!< capacity of the log group; if
+ the checkpoint age exceeds this, it is
+ a serious error because it is possible
+ we will then overwrite log and spoil
+ crash recovery */
+ ulint max_modified_age_async;
+ /*!< when this recommended
+ value for lsn -
+ buf_pool_get_oldest_modification()
+ is exceeded, we start an
+ asynchronous preflush of pool pages */
+ ulint max_modified_age_sync;
+ /*!< when this recommended
+ value for lsn -
+ buf_pool_get_oldest_modification()
+ is exceeded, we start a
+ synchronous preflush of pool pages */
+ ulint adm_checkpoint_interval;
+ /*!< administrator-specified checkpoint
+ interval in terms of log growth in
+ bytes; the interval actually used by
+ the database can be smaller */
+ ulint max_checkpoint_age_async;
+ /*!< when this checkpoint age
+ is exceeded we start an
+ asynchronous writing of a new
+ checkpoint */
+ ulint max_checkpoint_age;
+ /*!< this is the maximum allowed value
+ for lsn - last_checkpoint_lsn when a
+ new query step is started */
+ ib_uint64_t next_checkpoint_no;
+ /*!< next checkpoint number */
+ ib_uint64_t last_checkpoint_lsn;
+ /*!< latest checkpoint lsn */
+ ib_uint64_t next_checkpoint_lsn;
+ /*!< next checkpoint lsn */
+ ulint n_pending_checkpoint_writes;
+ /*!< number of currently pending
+ checkpoint writes */
+ rw_lock_t checkpoint_lock;/*!< this latch is x-locked when a
+ checkpoint write is running; a thread
+ should wait for this without owning
+ the log mutex */
+#endif /* !UNIV_HOTBACKUP */
+ byte* checkpoint_buf; /*!< checkpoint header is read to this
+ buffer */
+ /* @} */
+#ifdef UNIV_LOG_ARCHIVE
+ /** Fields involved in archiving @{ */
+ ulint archiving_state;/*!< LOG_ARCH_ON, LOG_ARCH_STOPPING
+ LOG_ARCH_STOPPED, LOG_ARCH_OFF */
+ ib_uint64_t archived_lsn; /*!< archiving has advanced to this
+ lsn */
+ ulint max_archived_lsn_age_async;
+ /*!< recommended maximum age of
+ archived_lsn, before we start
+ asynchronous copying to the archive */
+ ulint max_archived_lsn_age;
+ /*!< maximum allowed age for
+ archived_lsn */
+ ib_uint64_t next_archived_lsn;/*!< during an archive write,
+ until the write is completed, we
+ store the next value for
+ archived_lsn here: the write
+ completion function then sets the new
+ value to archived_lsn */
+ ulint archiving_phase;/*!< LOG_ARCHIVE_READ or
+ LOG_ARCHIVE_WRITE */
+ ulint n_pending_archive_ios;
+ /*!< number of currently pending reads
+ or writes in archiving */
+ rw_lock_t archive_lock; /*!< this latch is x-locked when an
+ archive write is running; a thread
+ should wait for this without owning
+ the log mutex */
+ ulint archive_buf_size;/*!< size of archive_buf */
+ byte* archive_buf; /*!< log segment is written to the
+ archive from this buffer */
+ os_event_t archiving_on; /*!< if archiving has been stopped,
+ a thread can wait for this event to
+ become signaled */
+ /* @} */
+#endif /* UNIV_LOG_ARCHIVE */
+};
+
+#ifdef UNIV_LOG_ARCHIVE
+/** Archiving state @{ */
+#define LOG_ARCH_ON 71
+#define LOG_ARCH_STOPPING 72
+#define LOG_ARCH_STOPPING2 73
+#define LOG_ARCH_STOPPED 74
+#define LOG_ARCH_OFF 75
+/* @} */
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifndef UNIV_NONINL
+#include "log0log.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic
new file mode 100644
index 00000000000..d071985982a
--- /dev/null
+++ b/storage/innobase/include/log0log.ic
@@ -0,0 +1,417 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.ic
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+#include "mach0data.h"
+#include "mtr0mtr.h"
+
+/******************************************************//**
+Checks by parsing that the catenated log segment for a single mtr is
+consistent. */
+UNIV_INTERN
+ibool
+log_check_log_recs(
+/*===============*/
+ byte* buf, /*!< in: pointer to the start of
+ the log segment in the
+ log_sys->buf log buffer */
+ ulint len, /*!< in: segment length in bytes */
+ ib_uint64_t buf_start_lsn); /*!< in: buffer start lsn */
+
+/************************************************************//**
+Gets a log block flush bit.
+@return TRUE if this block was the first to be written in a log flush */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+ const byte* log_block) /*!< in: log block */
+{
+ if (LOG_BLOCK_FLUSH_BIT_MASK
+ & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/************************************************************//**
+Sets the log block flush bit. */
+UNIV_INLINE
+void
+log_block_set_flush_bit(
+/*====================*/
+ byte* log_block, /*!< in/out: log block */
+ ibool val) /*!< in: value to set */
+{
+ ulint field;
+
+ field = mach_read_from_4(log_block + LOG_BLOCK_HDR_NO);
+
+ if (val) {
+ field = field | LOG_BLOCK_FLUSH_BIT_MASK;
+ } else {
+ field = field & ~LOG_BLOCK_FLUSH_BIT_MASK;
+ }
+
+ mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, field);
+}
+
+/************************************************************//**
+Gets a log block number stored in the header.
+@return log block number stored in the block header */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(~LOG_BLOCK_FLUSH_BIT_MASK
+ & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO));
+}
+
+/************************************************************//**
+Sets the log block number stored in the header; NOTE that this must be set
+before the flush bit! */
+UNIV_INLINE
+void
+log_block_set_hdr_no(
+/*=================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint n) /*!< in: log block number: must be > 0 and
+ < LOG_BLOCK_FLUSH_BIT_MASK */
+{
+ ut_ad(n > 0);
+ ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK);
+
+ mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, n);
+}
+
+/************************************************************//**
+Gets a log block data length.
+@return log block data length measured as a byte offset from the block start */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_2(log_block + LOG_BLOCK_HDR_DATA_LEN));
+}
+
+/************************************************************//**
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint len) /*!< in: data length */
+{
+ mach_write_to_2(log_block + LOG_BLOCK_HDR_DATA_LEN, len);
+}
+
+/************************************************************//**
+Gets a log block first mtr log record group offset.
+@return first mtr log record group byte offset from the block start, 0
+if none */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_2(log_block + LOG_BLOCK_FIRST_REC_GROUP));
+}
+
+/************************************************************//**
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint offset) /*!< in: offset, 0 if none */
+{
+ mach_write_to_2(log_block + LOG_BLOCK_FIRST_REC_GROUP, offset);
+}
+
+/************************************************************//**
+Gets a log block checkpoint number field (4 lowest bytes).
+@return checkpoint no (4 lowest bytes) */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_4(log_block + LOG_BLOCK_CHECKPOINT_NO));
+}
+
+/************************************************************//**
+Sets a log block checkpoint number field (4 lowest bytes). */
+UNIV_INLINE
+void
+log_block_set_checkpoint_no(
+/*========================*/
+ byte* log_block, /*!< in/out: log block */
+ ib_uint64_t no) /*!< in: checkpoint no */
+{
+ mach_write_to_4(log_block + LOG_BLOCK_CHECKPOINT_NO, (ulint) no);
+}
+
+/************************************************************//**
+Converts a lsn to a log block number.
+@return log block number, it is > 0 and <= 1G */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+ ib_uint64_t lsn) /*!< in: lsn of a byte within the block */
+{
+ return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & 0x3FFFFFFFUL) + 1);
+}
+
+/************************************************************//**
+Calculates the checksum for a log block.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_calc_checksum(
+/*====================*/
+ const byte* block) /*!< in: log block */
+{
+ ulint sum;
+ ulint sh;
+ ulint i;
+
+ sum = 1;
+ sh = 0;
+
+ for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) {
+ ulint b = (ulint) block[i];
+ sum &= 0x7FFFFFFFUL;
+ sum += b;
+ sum += b << sh;
+ sh++;
+ if (sh > 24) {
+ sh = 0;
+ }
+ }
+
+ return(sum);
+}
+
+/************************************************************//**
+Gets a log block checksum field value.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_get_checksum(
+/*===================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+ - LOG_BLOCK_CHECKSUM));
+}
+
+/************************************************************//**
+Sets a log block checksum field value. */
+UNIV_INLINE
+void
+log_block_set_checksum(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint checksum) /*!< in: checksum */
+{
+ mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+ - LOG_BLOCK_CHECKSUM,
+ checksum);
+}
+
+/************************************************************//**
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ ib_uint64_t lsn) /*!< in: lsn within the log block */
+{
+ ulint no;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ no = log_block_convert_lsn_to_no(lsn);
+
+ log_block_set_hdr_no(log_block, no);
+
+ log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+ log_block_set_first_rec_group(log_block, 0);
+}
+
+/************************************************************//**
+Initializes a log block in the log buffer in the old format, where there
+was no checksum yet. */
+UNIV_INLINE
+void
+log_block_init_in_old_format(
+/*=========================*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ ib_uint64_t lsn) /*!< in: lsn within the log block */
+{
+ ulint no;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ no = log_block_convert_lsn_to_no(lsn);
+
+ log_block_set_hdr_no(log_block, no);
+ mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+ - LOG_BLOCK_CHECKSUM, no);
+ log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+ log_block_set_first_rec_group(log_block, 0);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Writes to the log the string given. The log must be released with
+log_release.
+@return end lsn of the log record, zero if did not succeed */
+UNIV_INLINE
+ib_uint64_t
+log_reserve_and_write_fast(
+/*=======================*/
+ byte* str, /*!< in: string */
+ ulint len, /*!< in: string length */
+ ib_uint64_t* start_lsn,/*!< out: start lsn of the log record */
+ ibool* success)/*!< out: TRUE if success */
+{
+ log_t* log = log_sys;
+ ulint data_len;
+ ib_uint64_t lsn;
+
+ *success = TRUE;
+
+ mutex_enter(&(log->mutex));
+
+ data_len = len + log->buf_free % OS_FILE_LOG_BLOCK_SIZE;
+
+ if (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+
+ /* The string does not fit within the current log block
+ or the log block would become full */
+
+ *success = FALSE;
+
+ mutex_exit(&(log->mutex));
+
+ return(0);
+ }
+
+ *start_lsn = log->lsn;
+
+ ut_memcpy(log->buf + log->buf_free, str, len);
+
+ log_block_set_data_len((byte*) ut_align_down(log->buf + log->buf_free,
+ OS_FILE_LOG_BLOCK_SIZE),
+ data_len);
+#ifdef UNIV_LOG_DEBUG
+ log->old_buf_free = log->buf_free;
+ log->old_lsn = log->lsn;
+#endif
+ log->buf_free += len;
+
+ ut_ad(log->buf_free <= log->buf_size);
+
+ lsn = log->lsn += len;
+
+#ifdef UNIV_LOG_DEBUG
+ log_check_log_recs(log->buf + log->old_buf_free,
+ log->buf_free - log->old_buf_free, log->old_lsn);
+#endif
+ return(lsn);
+}
+
+/***********************************************************************//**
+Releases the log mutex. */
+UNIV_INLINE
+void
+log_release(void)
+/*=============*/
+{
+ mutex_exit(&(log_sys->mutex));
+}
+
+/************************************************************//**
+Gets the current lsn.
+@return current lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_lsn(void)
+/*=============*/
+{
+ ib_uint64_t lsn;
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(lsn);
+}
+
+/****************************************************************
+Gets the log group capacity. It is OK to read the value without
+holding log_sys->mutex because it is constant.
+@return log group capacity */
+UNIV_INLINE
+ulint
+log_get_capacity(void)
+/*==================*/
+{
+ return(log_sys->log_group_capacity);
+}
+
+/***********************************************************************//**
+Checks if there is need for a log buffer flush or a new checkpoint, and does
+this if yes. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void)
+/*================*/
+{
+ /* ut_ad(sync_thread_levels_empty()); */
+
+ if (log_sys->check_flush_or_checkpoint) {
+
+ log_check_margins();
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
new file mode 100644
index 00000000000..8468c213bdb
--- /dev/null
+++ b/storage/innobase/include/log0recv.h
@@ -0,0 +1,466 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.h
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0recv_h
+#define log0recv_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "buf0types.h"
+#include "hash0hash.h"
+#include "log0log.h"
+
+#ifdef UNIV_HOTBACKUP
+extern ibool recv_replay_file_ops;
+
+/*******************************************************************//**
+Reads the checkpoint info needed in hot backup.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+recv_read_cp_info_for_backup(
+/*=========================*/
+ const byte* hdr, /*!< in: buffer containing the log group
+ header */
+ ib_uint64_t* lsn, /*!< out: checkpoint lsn */
+ ulint* offset, /*!< out: checkpoint offset in the log group */
+ ulint* fsp_limit,/*!< out: fsp limit of space 0,
+ 1000000000 if the database is running
+ with < version 3.23.50 of InnoDB */
+ ib_uint64_t* cp_no, /*!< out: checkpoint number */
+ ib_uint64_t* first_header_lsn);
+ /*!< out: lsn of of the start of the
+ first log file */
+/*******************************************************************//**
+Scans the log segment and n_bytes_scanned is set to the length of valid
+log scanned. */
+UNIV_INTERN
+void
+recv_scan_log_seg_for_backup(
+/*=========================*/
+ byte* buf, /*!< in: buffer containing log data */
+ ulint buf_len, /*!< in: data length in that buffer */
+ ib_uint64_t* scanned_lsn, /*!< in/out: lsn of buffer start,
+ we return scanned lsn */
+ ulint* scanned_checkpoint_no,
+ /*!< in/out: 4 lowest bytes of the
+ highest scanned checkpoint number so
+ far */
+ ulint* n_bytes_scanned);/*!< out: how much we were able to
+ scan, smaller than buf_len if log
+ data ended here */
+#endif /* UNIV_HOTBACKUP */
+/*******************************************************************//**
+Returns TRUE if recovery is currently running.
+@return recv_recovery_on */
+UNIV_INLINE
+ibool
+recv_recovery_is_on(void);
+/*=====================*/
+#ifdef UNIV_LOG_ARCHIVE
+/*******************************************************************//**
+Returns TRUE if recovery from backup is currently running.
+@return recv_recovery_from_backup_on */
+UNIV_INLINE
+ibool
+recv_recovery_from_backup_is_on(void);
+/*=================================*/
+#endif /* UNIV_LOG_ARCHIVE */
+/************************************************************************//**
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool. */
+UNIV_INTERN
+void
+recv_recover_page_func(
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+ ibool just_read_in,
+ /*!< in: TRUE if the i/o handler calls
+ this for a freshly read page */
+#endif /* !UNIV_HOTBACKUP */
+ buf_block_t* block); /*!< in/out: buffer block */
+#ifndef UNIV_HOTBACKUP
+/** Wrapper for recv_recover_page_func().
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool.
+@param jri in: TRUE if just read in (the i/o handler calls this for
+a freshly read page)
+@param block in/out: the buffer block
+*/
+# define recv_recover_page(jri, block) recv_recover_page_func(jri, block)
+#else /* !UNIV_HOTBACKUP */
+/** Wrapper for recv_recover_page_func().
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool.
+@param jri in: TRUE if just read in (the i/o handler calls this for
+a freshly read page)
+@param block in/out: the buffer block
+*/
+# define recv_recover_page(jri, block) recv_recover_page_func(block)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************//**
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_checkpoint_start_func(
+/*=====================================*/
+#ifdef UNIV_LOG_ARCHIVE
+ ulint type, /*!< in: LOG_CHECKPOINT or
+ LOG_ARCHIVE */
+ ib_uint64_t limit_lsn, /*!< in: recover up to this lsn
+ if possible */
+#endif /* UNIV_LOG_ARCHIVE */
+ ib_uint64_t min_flushed_lsn,/*!< in: min flushed lsn from
+ data files */
+ ib_uint64_t max_flushed_lsn);/*!< in: max flushed lsn from
+ data files */
+#ifdef UNIV_LOG_ARCHIVE
+/** Wrapper for recv_recovery_from_checkpoint_start_func().
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@param type in: LOG_CHECKPOINT or LOG_ARCHIVE
+@param lim in: recover up to this log sequence number if possible
+@param min in: minimum flushed log sequence number from data files
+@param max in: maximum flushed log sequence number from data files
+@return error code or DB_SUCCESS */
+# define recv_recovery_from_checkpoint_start(type,lim,min,max) \
+ recv_recovery_from_checkpoint_start_func(type,lim,min,max)
+#else /* UNIV_LOG_ARCHIVE */
+/** Wrapper for recv_recovery_from_checkpoint_start_func().
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@param type ignored: LOG_CHECKPOINT or LOG_ARCHIVE
+@param lim ignored: recover up to this log sequence number if possible
+@param min in: minimum flushed log sequence number from data files
+@param max in: maximum flushed log sequence number from data files
+@return error code or DB_SUCCESS */
+# define recv_recovery_from_checkpoint_start(type,lim,min,max) \
+ recv_recovery_from_checkpoint_start_func(min,max)
+#endif /* UNIV_LOG_ARCHIVE */
+/********************************************************//**
+Completes recovery from a checkpoint. */
+UNIV_INTERN
+void
+recv_recovery_from_checkpoint_finish(void);
+/*======================================*/
+/*******************************************************//**
+Scans log from a buffer and stores new log data to the parsing buffer.
+Parses and hashes the log records if new data found. Unless
+UNIV_HOTBACKUP is defined, this function will apply log records
+automatically when the hash table becomes full.
+@return TRUE if limit_lsn has been reached, or not able to scan any
+more in this log group */
+UNIV_INTERN
+ibool
+recv_scan_log_recs(
+/*===============*/
+ ulint available_memory,/*!< in: we let the hash table of recs
+ to grow to this size, at the maximum */
+ ibool store_to_hash, /*!< in: TRUE if the records should be
+ stored to the hash table; this is set
+ to FALSE if just debug checking is
+ needed */
+ const byte* buf, /*!< in: buffer containing a log
+ segment or garbage */
+ ulint len, /*!< in: buffer length */
+ ib_uint64_t start_lsn, /*!< in: buffer start lsn */
+ ib_uint64_t* contiguous_lsn, /*!< in/out: it is known that all log
+ groups contain contiguous log data up
+ to this lsn */
+ ib_uint64_t* group_scanned_lsn);/*!< out: scanning succeeded up to
+ this lsn */
+/******************************************************//**
+Resets the logs. The contents of log files will be lost! */
+UNIV_INTERN
+void
+recv_reset_logs(
+/*============*/
+ ib_uint64_t lsn, /*!< in: reset to this lsn
+ rounded up to be divisible by
+ OS_FILE_LOG_BLOCK_SIZE, after
+ which we add
+ LOG_BLOCK_HDR_SIZE */
+#ifdef UNIV_LOG_ARCHIVE
+ ulint arch_log_no, /*!< in: next archived log file number */
+#endif /* UNIV_LOG_ARCHIVE */
+ ibool new_logs_created);/*!< in: TRUE if resetting logs
+ is done at the log creation;
+ FALSE if it is done after
+ archive recovery */
+#ifdef UNIV_HOTBACKUP
+/******************************************************//**
+Creates new log files after a backup has been restored. */
+UNIV_INTERN
+void
+recv_reset_log_files_for_backup(
+/*============================*/
+ const char* log_dir, /*!< in: log file directory path */
+ ulint n_log_files, /*!< in: number of log files */
+ ulint log_file_size, /*!< in: log file size */
+ ib_uint64_t lsn); /*!< in: new start lsn, must be
+ divisible by OS_FILE_LOG_BLOCK_SIZE */
+#endif /* UNIV_HOTBACKUP */
+/********************************************************//**
+Creates the recovery system. */
+UNIV_INTERN
+void
+recv_sys_create(void);
+/*=================*/
+/********************************************************//**
+Inits the recovery system for a recovery operation. */
+UNIV_INTERN
+void
+recv_sys_init(
+/*==========*/
+ ulint available_memory); /*!< in: available memory in bytes */
+/*******************************************************************//**
+Empties the hash table of stored log records, applying them to appropriate
+pages. */
+UNIV_INTERN
+void
+recv_apply_hashed_log_recs(
+/*=======================*/
+ ibool allow_ibuf); /*!< in: if TRUE, also ibuf operations are
+ allowed during the application; if FALSE,
+ no ibuf operations are allowed, and after
+ the application all file pages are flushed to
+ disk and invalidated in buffer pool: this
+ alternative means that no new log records
+ can be generated during the application */
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Applies log records in the hash table to a backup. */
+UNIV_INTERN
+void
+recv_apply_log_recs_for_backup(void);
+/*================================*/
+#endif
+#ifdef UNIV_LOG_ARCHIVE
+/********************************************************//**
+Recovers from archived log files, and also from log files, if they exist.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_archive_start(
+/*=============================*/
+ ib_uint64_t min_flushed_lsn,/*!< in: min flushed lsn field from the
+ data files */
+ ib_uint64_t limit_lsn, /*!< in: recover up to this lsn if
+ possible */
+ ulint first_log_no); /*!< in: number of the first archived
+ log file to use in the recovery; the
+ file will be searched from
+ INNOBASE_LOG_ARCH_DIR specified in
+ server config file */
+/********************************************************//**
+Completes recovery from archive. */
+UNIV_INTERN
+void
+recv_recovery_from_archive_finish(void);
+/*===================================*/
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** Block of log record data */
+typedef struct recv_data_struct recv_data_t;
+/** Block of log record data */
+struct recv_data_struct{
+ recv_data_t* next; /*!< pointer to the next block or NULL */
+ /*!< the log record data is stored physically
+ immediately after this struct, max amount
+ RECV_DATA_BLOCK_SIZE bytes of it */
+};
+
+/** Stored log record struct */
+typedef struct recv_struct recv_t;
+/** Stored log record struct */
+struct recv_struct{
+ byte type; /*!< log record type */
+ ulint len; /*!< log record body length in bytes */
+ recv_data_t* data; /*!< chain of blocks containing the log record
+ body */
+ ib_uint64_t start_lsn;/*!< start lsn of the log segment written by
+ the mtr which generated this log record: NOTE
+ that this is not necessarily the start lsn of
+ this log record */
+ ib_uint64_t end_lsn;/*!< end lsn of the log segment written by
+ the mtr which generated this log record: NOTE
+ that this is not necessarily the end lsn of
+ this log record */
+ UT_LIST_NODE_T(recv_t)
+ rec_list;/*!< list of log records for this page */
+};
+
+/** States of recv_addr_struct */
+enum recv_addr_state {
+ /** not yet processed */
+ RECV_NOT_PROCESSED,
+ /** page is being read */
+ RECV_BEING_READ,
+ /** log records are being applied on the page */
+ RECV_BEING_PROCESSED,
+ /** log records have been applied on the page, or they have
+ been discarded because the tablespace does not exist */
+ RECV_PROCESSED
+};
+
+/** Hashed page file address struct */
+typedef struct recv_addr_struct recv_addr_t;
+/** Hashed page file address struct */
+struct recv_addr_struct{
+ enum recv_addr_state state;
+ /*!< recovery state of the page */
+ ulint space; /*!< space id */
+ ulint page_no;/*!< page number */
+ UT_LIST_BASE_NODE_T(recv_t)
+ rec_list;/*!< list of log records for this page */
+ hash_node_t addr_hash;/*!< hash node in the hash bucket chain */
+};
+
+/** Recovery system data structure */
+typedef struct recv_sys_struct recv_sys_t;
+/** Recovery system data structure */
+struct recv_sys_struct{
+#ifndef UNIV_HOTBACKUP
+ mutex_t mutex; /*!< mutex protecting the fields apply_log_recs,
+ n_addrs, and the state field in each recv_addr
+ struct */
+#endif /* !UNIV_HOTBACKUP */
+ ibool apply_log_recs;
+ /*!< this is TRUE when log rec application to
+ pages is allowed; this flag tells the
+ i/o-handler if it should do log record
+ application */
+ ibool apply_batch_on;
+ /*!< this is TRUE when a log rec application
+ batch is running */
+ ib_uint64_t lsn; /*!< log sequence number */
+ ulint last_log_buf_size;
+ /*!< size of the log buffer when the database
+ last time wrote to the log */
+ byte* last_block;
+ /*!< possible incomplete last recovered log
+ block */
+ byte* last_block_buf_start;
+ /*!< the nonaligned start address of the
+ preceding buffer */
+ byte* buf; /*!< buffer for parsing log records */
+ ulint len; /*!< amount of data in buf */
+ ib_uint64_t parse_start_lsn;
+ /*!< this is the lsn from which we were able to
+ start parsing log records and adding them to
+ the hash table; zero if a suitable
+ start point not found yet */
+ ib_uint64_t scanned_lsn;
+ /*!< the log data has been scanned up to this
+ lsn */
+ ulint scanned_checkpoint_no;
+ /*!< the log data has been scanned up to this
+ checkpoint number (lowest 4 bytes) */
+ ulint recovered_offset;
+ /*!< start offset of non-parsed log records in
+ buf */
+ ib_uint64_t recovered_lsn;
+ /*!< the log records have been parsed up to
+ this lsn */
+ ib_uint64_t limit_lsn;/*!< recovery should be made at most
+ up to this lsn */
+ ibool found_corrupt_log;
+ /*!< this is set to TRUE if we during log
+ scan find a corrupt log block, or a corrupt
+ log record, or there is a log parsing
+ buffer overflow */
+#ifdef UNIV_LOG_ARCHIVE
+ log_group_t* archive_group;
+ /*!< in archive recovery: the log group whose
+ archive is read */
+#endif /* !UNIV_LOG_ARCHIVE */
+ mem_heap_t* heap; /*!< memory heap of log records and file
+ addresses*/
+ hash_table_t* addr_hash;/*!< hash table of file addresses of pages */
+ ulint n_addrs;/*!< number of not processed hashed file
+ addresses in the hash table */
+};
+
+/** The recovery system */
+extern recv_sys_t* recv_sys;
+
+/** TRUE when applying redo log records during crash recovery; FALSE
+otherwise. Note that this is FALSE while a background thread is
+rolling back incomplete transactions. */
+extern ibool recv_recovery_on;
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this becomes TRUE if
+the log record hash table becomes too full, and log records must be merged
+to file pages already before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+TRUE means that recovery is running and no operations on the log files
+are allowed yet: the variable name is misleading. */
+extern ibool recv_no_ibuf_operations;
+/** TRUE when recv_init_crash_recovery() has been called. */
+extern ibool recv_needed_recovery;
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
+recv_recovery_from_checkpoint_start_func(). */
+extern ibool recv_lsn_checks_on;
+#ifdef UNIV_HOTBACKUP
+/** TRUE when the redo log is being backed up */
+extern ibool recv_is_making_a_backup;
+#endif /* UNIV_HOTBACKUP */
+/** Maximum page number encountered in the redo log */
+extern ulint recv_max_parsed_page_no;
+
+/** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many
+times! */
+#define RECV_PARSING_BUF_SIZE (2 * 1024 * 1024)
+
+/** Size of block reads when the log groups are scanned forward to do a
+roll-forward */
+#define RECV_SCAN_SIZE (4 * UNIV_PAGE_SIZE)
+
+/** This many frames must be left free in the buffer pool when we scan
+the log and store the scanned log records in the buffer pool: we will
+use these free frames to read in pages when we start applying the
+log records to the database. */
+extern ulint recv_n_pool_free_frames;
+
+#ifndef UNIV_NONINL
+#include "log0recv.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/log0recv.ic b/storage/innobase/include/log0recv.ic
new file mode 100644
index 00000000000..0a8e55b96fa
--- /dev/null
+++ b/storage/innobase/include/log0recv.ic
@@ -0,0 +1,53 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.ic
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+
+/*******************************************************************//**
+Returns TRUE if recovery is currently running.
+@return recv_recovery_on */
+UNIV_INLINE
+ibool
+recv_recovery_is_on(void)
+/*=====================*/
+{
+ return(UNIV_UNLIKELY(recv_recovery_on));
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/** TRUE when applying redo log records from an archived log file */
+extern ibool recv_recovery_from_backup_on;
+
+/*******************************************************************//**
+Returns TRUE if recovery from backup is currently running.
+@return recv_recovery_from_backup_on */
+UNIV_INLINE
+ibool
+recv_recovery_from_backup_is_on(void)
+/*=================================*/
+{
+ return(recv_recovery_from_backup_on);
+}
+#endif /* UNIV_LOG_ARCHIVE */
diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h
new file mode 100644
index 00000000000..44ee3df22ce
--- /dev/null
+++ b/storage/innobase/include/mach0data.h
@@ -0,0 +1,400 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.h
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef mach0data_h
+#define mach0data_h
+
+#include "univ.i"
+#include "ut0byte.h"
+
+/* The data and all fields are always stored in a database file
+in the same format: ascii, big-endian, ... .
+All data in the files MUST be accessed using the functions in this
+module. */
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /*!< in: pointer to byte where to store */
+ ulint n); /*!< in: ulint integer to be stored, >= 0, < 256 */
+/********************************************************//**
+The following function is used to fetch data from one byte.
+@return ulint integer, >= 0, < 256 */
+UNIV_INLINE
+ulint
+mach_read_from_1(
+/*=============*/
+ const byte* b) /*!< in: pointer to byte */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lower address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /*!< in: pointer to two bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored, >= 0, < 64k */
+/********************************************************//**
+The following function is used to fetch data from two consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer, >= 0, < 64k */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+ const byte* b) /*!< in: pointer to two bytes */
+ __attribute__((nonnull, pure));
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+ ulint n) /*!< in: integer in machine-dependent format */
+ __attribute__((const));
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+ uint16 n) /*!< in: 16-bit integer in canonical format */
+ __attribute__((const));
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /*!< in: pointer to 3 bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_3(
+/*=============*/
+ const byte* b) /*!< in: pointer to 3 bytes */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /*!< in: pointer to four bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_4(
+/*=============*/
+ const byte* b) /*!< in: pointer to four bytes */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in a compressed form (1..5 bytes).
+@return stored size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/*********************************************************//**
+Returns the size of an ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ ulint n) /*!< in: ulint integer to be stored */
+ __attribute__((const));
+/*********************************************************//**
+Reads a ulint in a compressed form.
+@return read integer */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /*!< in: pointer to 6 bytes where to store */
+ dulint n); /*!< in: dulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_6(
+/*=============*/
+ const byte* b) /*!< in: pointer to 6 bytes */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /*!< in: pointer to 7 bytes where to store */
+ dulint n); /*!< in: dulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_7(
+/*=============*/
+ const byte* b) /*!< in: pointer to 7 bytes */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ byte* b, /*!< in: pointer to 8 bytes where to store */
+ dulint n); /*!< in: dulint integer to be stored */
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_ull(
+/*===========*/
+ byte* b, /*!< in: pointer to 8 bytes where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_8(
+/*=============*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+ __attribute__((nonnull, pure));
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_ull(
+/*==========*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a dulint in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_compressed(
+/*=========================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ dulint n); /*!< in: dulint integer to be stored */
+/*********************************************************//**
+Returns the size of a dulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_compressed_size(
+/*============================*/
+ dulint n); /*!< in: dulint integer to be stored */
+/*********************************************************//**
+Reads a dulint in a compressed form.
+@return read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_compressed(
+/*========================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a dulint in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_much_compressed(
+/*==============================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ dulint n); /*!< in: dulint integer to be stored */
+/*********************************************************//**
+Returns the size of a dulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_much_compressed_size(
+/*=================================*/
+ dulint n) /*!< in: dulint integer to be stored */
+ __attribute__((const));
+/*********************************************************//**
+Reads a dulint in a compressed form.
+@return read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_much_compressed(
+/*=============================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Reads a ulint in a compressed form if the log record fully contains it.
+@return pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_parse_compressed(
+/*==================*/
+ byte* ptr, /*!< in: pointer to buffer from where to read */
+ byte* end_ptr,/*!< in: pointer to end of the buffer */
+ ulint* val); /*!< out: read value */
+/*********************************************************//**
+Reads a dulint in a compressed form if the log record fully contains it.
+@return pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_dulint_parse_compressed(
+/*=========================*/
+ byte* ptr, /*!< in: pointer to buffer from where to read */
+ byte* end_ptr,/*!< in: pointer to end of the buffer */
+ dulint* val); /*!< out: read value */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ double d); /*!< in: double */
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ float d); /*!< in: float */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ const byte* buf, /*!< in: from where to read */
+ ulint buf_size) /*!< in: from how many bytes to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint dest_size, /*!< in: into how many bytes to write */
+ ulint n); /*!< in: unsigned long int to write */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ const byte* buf) /*!< in: from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint n); /*!< in: unsigned long int to write */
+
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ullint
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type); /*!< in: signed or unsigned flag */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "mach0data.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/mach0data.ic b/storage/innobase/include/mach0data.ic
new file mode 100644
index 00000000000..ef20356bd31
--- /dev/null
+++ b/storage/innobase/include/mach0data.ic
@@ -0,0 +1,786 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.ic
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "ut0mem.h"
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /*!< in: pointer to byte where to store */
+ ulint n) /*!< in: ulint integer to be stored, >= 0, < 256 */
+{
+ ut_ad(b);
+ ut_ad(n <= 0xFFUL);
+
+ b[0] = (byte)n;
+}
+
+/********************************************************//**
+The following function is used to fetch data from one byte.
+@return ulint integer, >= 0, < 256 */
+UNIV_INLINE
+ulint
+mach_read_from_1(
+/*=============*/
+ const byte* b) /*!< in: pointer to byte */
+{
+ ut_ad(b);
+ return((ulint)(b[0]));
+}
+
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /*!< in: pointer to two bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad(b);
+ ut_ad(n <= 0xFFFFUL);
+
+ b[0] = (byte)(n >> 8);
+ b[1] = (byte)(n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+ const byte* b) /*!< in: pointer to 2 bytes */
+{
+ ut_ad(b);
+ return( ((ulint)(b[0]) << 8)
+ + (ulint)(b[1])
+ );
+}
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+ ulint n) /*!< in: integer in machine-dependent format */
+{
+ uint16 ret;
+ ut_ad(2 == sizeof ret);
+ mach_write_to_2((byte*) &ret, n);
+ return(ret);
+}
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+ uint16 n) /*!< in: 16-bit integer in canonical format */
+{
+ ut_ad(2 == sizeof n);
+ return(mach_read_from_2((const byte*) &n));
+}
+
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /*!< in: pointer to 3 bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad(b);
+ ut_ad(n <= 0xFFFFFFUL);
+
+ b[0] = (byte)(n >> 16);
+ b[1] = (byte)(n >> 8);
+ b[2] = (byte)(n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_3(
+/*=============*/
+ const byte* b) /*!< in: pointer to 3 bytes */
+{
+ ut_ad(b);
+ return( ((ulint)(b[0]) << 16)
+ + ((ulint)(b[1]) << 8)
+ + (ulint)(b[2])
+ );
+}
+
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /*!< in: pointer to four bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad(b);
+
+ b[0] = (byte)(n >> 24);
+ b[1] = (byte)(n >> 16);
+ b[2] = (byte)(n >> 8);
+ b[3] = (byte)n;
+}
+
+/********************************************************//**
+The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_4(
+/*=============*/
+ const byte* b) /*!< in: pointer to four bytes */
+{
+ ut_ad(b);
+ return( ((ulint)(b[0]) << 24)
+ + ((ulint)(b[1]) << 16)
+ + ((ulint)(b[2]) << 8)
+ + (ulint)(b[3])
+ );
+}
+
+/*********************************************************//**
+Writes a ulint in a compressed form where the first byte codes the
+length of the stored ulint. We look at the most significant bits of
+the byte. If the most significant bit is zero, it means 1-byte storage,
+else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0,
+it means 3-byte storage, else if 4th is 0, it means 4-byte storage,
+else the storage is 5-byte.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ulint n) /*!< in: ulint integer (< 2^32) to be stored */
+{
+ ut_ad(b);
+
+ if (n < 0x80UL) {
+ mach_write_to_1(b, n);
+ return(1);
+ } else if (n < 0x4000UL) {
+ mach_write_to_2(b, n | 0x8000UL);
+ return(2);
+ } else if (n < 0x200000UL) {
+ mach_write_to_3(b, n | 0xC00000UL);
+ return(3);
+ } else if (n < 0x10000000UL) {
+ mach_write_to_4(b, n | 0xE0000000UL);
+ return(4);
+ } else {
+ mach_write_to_1(b, 0xF0UL);
+ mach_write_to_4(b + 1, n);
+ return(5);
+ }
+}
+
+/*********************************************************//**
+Returns the size of a ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ ulint n) /*!< in: ulint integer (< 2^32) to be stored */
+{
+ if (n < 0x80UL) {
+ return(1);
+ } else if (n < 0x4000UL) {
+ return(2);
+ } else if (n < 0x200000UL) {
+ return(3);
+ } else if (n < 0x10000000UL) {
+ return(4);
+ } else {
+ return(5);
+ }
+}
+
+/*********************************************************//**
+Reads a ulint in a compressed form.
+@return read integer (< 2^32) */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ulint flag;
+
+ ut_ad(b);
+
+ flag = mach_read_from_1(b);
+
+ if (flag < 0x80UL) {
+ return(flag);
+ } else if (flag < 0xC0UL) {
+ return(mach_read_from_2(b) & 0x7FFFUL);
+ } else if (flag < 0xE0UL) {
+ return(mach_read_from_3(b) & 0x3FFFFFUL);
+ } else if (flag < 0xF0UL) {
+ return(mach_read_from_4(b) & 0x1FFFFFFFUL);
+ } else {
+ ut_ad(flag == 0xF0UL);
+ return(mach_read_from_4(b + 1));
+ }
+}
+
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ byte* b, /*!< in: pointer to 8 bytes where to store */
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_4(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + 4, ut_dulint_get_low(n));
+}
+
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_ull(
+/*===========*/
+ byte* b, /*!< in: pointer to 8 bytes where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_4(b, (ulint) (n >> 32));
+ mach_write_to_4(b + 4, (ulint) n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_8(
+/*=============*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+{
+ ulint high;
+ ulint low;
+
+ ut_ad(b);
+
+ high = mach_read_from_4(b);
+ low = mach_read_from_4(b + 4);
+
+ return(ut_dulint_create(high, low));
+}
+
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_ull(
+/*==========*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+{
+ ib_uint64_t ull;
+
+ ull = ((ib_uint64_t) mach_read_from_4(b)) << 32;
+ ull |= (ib_uint64_t) mach_read_from_4(b + 4);
+
+ return(ull);
+}
+
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /*!< in: pointer to 7 bytes where to store */
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_3(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + 3, ut_dulint_get_low(n));
+}
+
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_7(
+/*=============*/
+ const byte* b) /*!< in: pointer to 7 bytes */
+{
+ ulint high;
+ ulint low;
+
+ ut_ad(b);
+
+ high = mach_read_from_3(b);
+ low = mach_read_from_4(b + 3);
+
+ return(ut_dulint_create(high, low));
+}
+
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /*!< in: pointer to 6 bytes where to store */
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_2(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + 2, ut_dulint_get_low(n));
+}
+
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_6(
+/*=============*/
+ const byte* b) /*!< in: pointer to 6 bytes */
+{
+ ulint high;
+ ulint low;
+
+ ut_ad(b);
+
+ high = mach_read_from_2(b);
+ low = mach_read_from_4(b + 2);
+
+ return(ut_dulint_create(high, low));
+}
+
+/*********************************************************//**
+Writes a dulint in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_compressed(
+/*=========================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ ulint size;
+
+ ut_ad(b);
+
+ size = mach_write_compressed(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + size, ut_dulint_get_low(n));
+
+ return(size + 4);
+}
+
+/*********************************************************//**
+Returns the size of a dulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_compressed_size(
+/*============================*/
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ return(4 + mach_get_compressed_size(ut_dulint_get_high(n)));
+}
+
+/*********************************************************//**
+Reads a dulint in a compressed form.
+@return read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_compressed(
+/*========================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ulint high;
+ ulint low;
+ ulint size;
+
+ ut_ad(b);
+
+ high = mach_read_compressed(b);
+
+ size = mach_get_compressed_size(high);
+
+ low = mach_read_from_4(b + size);
+
+ return(ut_dulint_create(high, low));
+}
+
+/*********************************************************//**
+Writes a dulint in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_much_compressed(
+/*==============================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ ulint size;
+
+ ut_ad(b);
+
+ if (ut_dulint_get_high(n) == 0) {
+ return(mach_write_compressed(b, ut_dulint_get_low(n)));
+ }
+
+ *b = (byte)0xFF;
+ size = 1 + mach_write_compressed(b + 1, ut_dulint_get_high(n));
+
+ size += mach_write_compressed(b + size, ut_dulint_get_low(n));
+
+ return(size);
+}
+
+/*********************************************************//**
+Returns the size of a dulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_much_compressed_size(
+/*=================================*/
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ if (0 == ut_dulint_get_high(n)) {
+ return(mach_get_compressed_size(ut_dulint_get_low(n)));
+ }
+
+ return(1 + mach_get_compressed_size(ut_dulint_get_high(n))
+ + mach_get_compressed_size(ut_dulint_get_low(n)));
+}
+
+/*********************************************************//**
+Reads a dulint in a compressed form.
+@return read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_much_compressed(
+/*=============================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ulint high;
+ ulint low;
+ ulint size;
+
+ ut_ad(b);
+
+ if (*b != (byte)0xFF) {
+ high = 0;
+ size = 0;
+ } else {
+ high = mach_read_compressed(b + 1);
+
+ size = 1 + mach_get_compressed_size(high);
+ }
+
+ low = mach_read_compressed(b + size);
+
+ return(ut_dulint_create(high, low));
+}
+#ifndef UNIV_HOTBACKUP
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ double d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(double) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ double d) /*!< in: double */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(double) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ float d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(float) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ float d) /*!< in: float */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(float) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ const byte* buf, /*!< in: from where to read */
+ ulint buf_size) /*!< in: from how many bytes to read */
+{
+ ulint n = 0;
+ const byte* ptr;
+
+ ut_ad(buf_size <= sizeof(ulint));
+ ut_ad(buf_size > 0);
+
+ ptr = buf + buf_size;
+
+ for (;;) {
+ ptr--;
+
+ n = n << 8;
+
+ n += (ulint)(*ptr);
+
+ if (ptr == buf) {
+ break;
+ }
+ }
+
+ return(n);
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint dest_size, /*!< in: into how many bytes to write */
+ ulint n) /*!< in: unsigned long int to write */
+{
+ byte* end;
+
+ ut_ad(dest_size <= sizeof(ulint));
+ ut_ad(dest_size > 0);
+
+ end = dest + dest_size;
+
+ for (;;) {
+ *dest = (byte)(n & 0xFF);
+
+ n = n >> 8;
+
+ dest++;
+
+ if (dest == end) {
+ break;
+ }
+ }
+
+ ut_ad(n == 0);
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ const byte* buf) /*!< in: from where to read */
+{
+ return((ulint)(*buf) + ((ulint)(*(buf + 1))) * 256);
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint n) /*!< in: unsigned long int to write */
+{
+ ut_ad(n < 256 * 256);
+
+ *dest = (byte)(n & 0xFFUL);
+
+ n = n >> 8;
+ dest++;
+
+ *dest = (byte)(n & 0xFFUL);
+}
+
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ullint
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ /* XXX this can be optimized on big-endian machines */
+
+ ullint ret;
+ uint i;
+
+ if (unsigned_type || (src[0] & 0x80)) {
+
+ ret = 0x0000000000000000ULL;
+ } else {
+
+ ret = 0xFFFFFFFFFFFFFF00ULL;
+ }
+
+ if (unsigned_type) {
+
+ ret |= src[0];
+ } else {
+
+ ret |= src[0] ^ 0x80;
+ }
+
+ for (i = 1; i < len; i++) {
+ ret <<= 8;
+ ret |= src[i];
+ }
+
+ return(ret);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/mem0dbg.h b/storage/innobase/include/mem0dbg.h
new file mode 100644
index 00000000000..a064af5c678
--- /dev/null
+++ b/storage/innobase/include/mem0dbg.h
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0dbg.h
+The memory management: the debug code. This is not a compilation module,
+but is included in mem0mem.* !
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+/* In the debug version each allocated field is surrounded with
+check fields whose sizes are given below */
+
+#ifdef UNIV_MEM_DEBUG
+#define MEM_FIELD_HEADER_SIZE ut_calc_align(2 * sizeof(ulint),\
+ UNIV_MEM_ALIGNMENT)
+#define MEM_FIELD_TRAILER_SIZE sizeof(ulint)
+#else
+#define MEM_FIELD_HEADER_SIZE 0
+#endif
+
+
+/* Space needed when allocating for a user a field of
+length N. The space is allocated only in multiples of
+UNIV_MEM_ALIGNMENT. In the debug version there are also
+check fields at the both ends of the field. */
+#ifdef UNIV_MEM_DEBUG
+#define MEM_SPACE_NEEDED(N) ut_calc_align((N) + MEM_FIELD_HEADER_SIZE\
+ + MEM_FIELD_TRAILER_SIZE, UNIV_MEM_ALIGNMENT)
+#else
+#define MEM_SPACE_NEEDED(N) ut_calc_align((N), UNIV_MEM_ALIGNMENT)
+#endif
+
+#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG
+/***************************************************************//**
+Checks a memory heap for consistency and prints the contents if requested.
+Outputs the sum of sizes of buffers given to the user (only in
+the debug version), the physical size of the heap and the number of
+blocks in the heap. In case of error returns 0 as sizes and number
+of blocks. */
+UNIV_INTERN
+void
+mem_heap_validate_or_print(
+/*=======================*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ byte* top, /*!< in: calculate and validate only until
+ this top pointer in the heap is reached,
+ if this pointer is NULL, ignored */
+ ibool print, /*!< in: if TRUE, prints the contents
+ of the heap; works only in
+ the debug version */
+ ibool* error, /*!< out: TRUE if error */
+ ulint* us_size,/*!< out: allocated memory
+ (for the user) in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored; in the
+ non-debug version this is always -1 */
+ ulint* ph_size,/*!< out: physical size of the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+ ulint* n_blocks); /*!< out: number of blocks in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+/**************************************************************//**
+Validates the contents of a memory heap.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_validate(
+/*==============*/
+ mem_heap_t* heap); /*!< in: memory heap */
+#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Checks that an object is a memory heap (or a block of it)
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_check(
+/*===========*/
+ mem_heap_t* heap); /*!< in: memory heap */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_MEM_DEBUG
+/*****************************************************************//**
+TRUE if no memory is currently allocated.
+@return TRUE if no heaps exist */
+UNIV_INTERN
+ibool
+mem_all_freed(void);
+/*===============*/
+/*****************************************************************//**
+Validates the dynamic memory
+@return TRUE if error */
+UNIV_INTERN
+ibool
+mem_validate_no_assert(void);
+/*=========================*/
+/************************************************************//**
+Validates the dynamic memory
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_validate(void);
+/*===============*/
+#endif /* UNIV_MEM_DEBUG */
+/************************************************************//**
+Tries to find neigboring memory allocation blocks and dumps to stderr
+the neighborhood of a given pointer. */
+UNIV_INTERN
+void
+mem_analyze_corruption(
+/*===================*/
+ void* ptr); /*!< in: pointer to place of possible corruption */
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers. Can only be used in the debug version. */
+UNIV_INTERN
+void
+mem_print_info(void);
+/*================*/
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers since the last ..._print_info or..._print_new_info. */
+UNIV_INTERN
+void
+mem_print_new_info(void);
+/*====================*/
diff --git a/storage/innobase/include/mem0dbg.ic b/storage/innobase/include/mem0dbg.ic
new file mode 100644
index 00000000000..cb9245411dc
--- /dev/null
+++ b/storage/innobase/include/mem0dbg.ic
@@ -0,0 +1,112 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0dbg.ic
+The memory management: the debug code. This is not an independent
+compilation module but is included in mem0mem.*.
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_MEM_DEBUG
+# ifndef UNIV_HOTBACKUP
+extern mutex_t mem_hash_mutex;
+# endif /* !UNIV_HOTBACKUP */
+extern ulint mem_current_allocated_memory;
+
+/******************************************************************//**
+Initializes an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_init(
+/*===========*/
+ byte* buf, /*!< in: memory field */
+ ulint n); /*!< in: how many bytes the user requested */
+/******************************************************************//**
+Erases an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_erase(
+/*============*/
+ byte* buf, /*!< in: memory field */
+ ulint n); /*!< in: how many bytes the user requested */
+/***************************************************************//**
+Initializes a buffer to a random combination of hex BA and BE.
+Used to initialize allocated memory. */
+UNIV_INTERN
+void
+mem_init_buf(
+/*=========*/
+ byte* buf, /*!< in: pointer to buffer */
+ ulint n); /*!< in: length of buffer */
+/***************************************************************//**
+Initializes a buffer to a random combination of hex DE and AD.
+Used to erase freed memory. */
+UNIV_INTERN
+void
+mem_erase_buf(
+/*==========*/
+ byte* buf, /*!< in: pointer to buffer */
+ ulint n); /*!< in: length of buffer */
+/***************************************************************//**
+Inserts a created memory heap to the hash table of
+current allocated memory heaps.
+Initializes the hash table when first called. */
+UNIV_INTERN
+void
+mem_hash_insert(
+/*============*/
+ mem_heap_t* heap, /*!< in: the created heap */
+ const char* file_name, /*!< in: file name of creation */
+ ulint line); /*!< in: line where created */
+/***************************************************************//**
+Removes a memory heap (which is going to be freed by the caller)
+from the list of live memory heaps. Returns the size of the heap
+in terms of how much memory in bytes was allocated for the user of
+the heap (not the total space occupied by the heap).
+Also validates the heap.
+NOTE: This function does not free the storage occupied by the
+heap itself, only the node in the list of heaps. */
+UNIV_INTERN
+void
+mem_hash_remove(
+/*============*/
+ mem_heap_t* heap, /*!< in: the heap to be freed */
+ const char* file_name, /*!< in: file name of freeing */
+ ulint line); /*!< in: line where freed */
+
+
+void
+mem_field_header_set_len(byte* field, ulint len);
+
+ulint
+mem_field_header_get_len(byte* field);
+
+void
+mem_field_header_set_check(byte* field, ulint check);
+
+ulint
+mem_field_header_get_check(byte* field);
+
+void
+mem_field_trailer_set_check(byte* field, ulint check);
+
+ulint
+mem_field_trailer_get_check(byte* field);
+#endif /* UNIV_MEM_DEBUG */
diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h
new file mode 100644
index 00000000000..a092b024219
--- /dev/null
+++ b/storage/innobase/include/mem0mem.h
@@ -0,0 +1,392 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0mem.h
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0mem_h
+#define mem0mem_h
+
+#include "univ.i"
+#include "ut0mem.h"
+#include "ut0byte.h"
+#include "ut0rnd.h"
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+#endif /* UNIV_HOTBACKUP */
+#include "ut0lst.h"
+#include "mach0data.h"
+
+/* -------------------- MEMORY HEAPS ----------------------------- */
+
+/* The info structure stored at the beginning of a heap block */
+typedef struct mem_block_info_struct mem_block_info_t;
+
+/* A block of a memory heap consists of the info structure
+followed by an area of memory */
+typedef mem_block_info_t mem_block_t;
+
+/* A memory heap is a nonempty linear list of memory blocks */
+typedef mem_block_t mem_heap_t;
+
+/* Types of allocation for memory heaps: DYNAMIC means allocation from the
+dynamic memory pool of the C compiler, BUFFER means allocation from the
+buffer pool; the latter method is used for very big heaps */
+
+#define MEM_HEAP_DYNAMIC 0 /* the most common type */
+#define MEM_HEAP_BUFFER 1
+#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be
+ ORed to MEM_HEAP_BUFFER, in which
+ case heap->free_block is used in
+ some cases for memory allocations,
+ and if it's NULL, the memory
+ allocation functions can return
+ NULL. */
+
+/* The following start size is used for the first block in the memory heap if
+the size is not specified, i.e., 0 is given as the parameter in the call of
+create. The standard size is the maximum (payload) size of the blocks used for
+allocations of small buffers. */
+
+#define MEM_BLOCK_START_SIZE 64
+#define MEM_BLOCK_STANDARD_SIZE \
+ (UNIV_PAGE_SIZE >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF)
+
+/* If a memory heap is allowed to grow into the buffer pool, the following
+is the maximum size for a single allocated buffer: */
+#define MEM_MAX_ALLOC_IN_BUF (UNIV_PAGE_SIZE - 200)
+
+/******************************************************************//**
+Initializes the memory system. */
+UNIV_INTERN
+void
+mem_init(
+/*=====*/
+ ulint size); /*!< in: common pool size in bytes */
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+#define mem_heap_create(N) mem_heap_create_func(\
+ (N), MEM_HEAP_DYNAMIC, __FILE__, __LINE__)
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+#define mem_heap_create_in_buffer(N) mem_heap_create_func(\
+ (N), MEM_HEAP_BUFFER, __FILE__, __LINE__)
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+#define mem_heap_create_in_btr_search(N) mem_heap_create_func(\
+ (N), MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER,\
+ __FILE__, __LINE__)
+
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap freeing. */
+
+#define mem_heap_free(heap) mem_heap_free_func(\
+ (heap), __FILE__, __LINE__)
+/*****************************************************************//**
+NOTE: Use the corresponding macros instead of this function. Creates a
+memory heap. For debugging purposes, takes also the file name and line as
+arguments.
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+/*=================*/
+ ulint n, /*!< in: desired start block size,
+ this means that a single user buffer
+ of size n will fit in the block,
+ 0 creates a default size block */
+ ulint type, /*!< in: heap type */
+ const char* file_name, /*!< in: file name where created */
+ ulint line); /*!< in: line where created */
+/*****************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees the space
+occupied by a memory heap. In the debug version erases the heap memory
+blocks. */
+UNIV_INLINE
+void
+mem_heap_free_func(
+/*===============*/
+ mem_heap_t* heap, /*!< in, own: heap to be freed */
+ const char* file_name, /*!< in: file name where freed */
+ ulint line); /*!< in: line where freed */
+/***************************************************************//**
+Allocates and zero-fills n bytes of memory from a memory heap.
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+/***************************************************************//**
+Allocates n bytes of memory from a memory heap.
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+/*===========*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+/*****************************************************************//**
+Returns a pointer to the heap top.
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+/*==================*/
+ mem_heap_t* heap); /*!< in: memory heap */
+/*****************************************************************//**
+Frees the space in a memory heap exceeding the pointer given. The
+pointer must have been acquired from mem_heap_get_heap_top. The first
+memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+/*===================*/
+ mem_heap_t* heap, /*!< in: heap from which to free */
+ byte* old_top);/*!< in: pointer to old top of heap */
+/*****************************************************************//**
+Empties a memory heap. The first memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_empty(
+/*===========*/
+ mem_heap_t* heap); /*!< in: heap to empty */
+/*****************************************************************//**
+Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given.
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: size of the topmost element */
+/*****************************************************************//**
+Frees the topmost element in a memory heap.
+The size of the element must be given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: size of the topmost element */
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap); /*!< in: heap */
+/**************************************************************//**
+Use this macro instead of the corresponding function!
+Macro for memory buffer allocation */
+
+#define mem_zalloc(N) memset(mem_alloc(N), 0, (N));
+
+#define mem_alloc(N) mem_alloc_func((N), NULL, __FILE__, __LINE__)
+#define mem_alloc2(N,S) mem_alloc_func((N), (S), __FILE__, __LINE__)
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. Is like malloc of C. The buffer must be freed
+with mem_free.
+@return own: free storage */
+UNIV_INLINE
+void*
+mem_alloc_func(
+/*===========*/
+ ulint n, /*!< in: requested size in bytes */
+ ulint* size, /*!< out: allocated size in bytes,
+ or NULL */
+ const char* file_name, /*!< in: file name where created */
+ ulint line); /*!< in: line where created */
+
+/**************************************************************//**
+Use this macro instead of the corresponding function!
+Macro for memory buffer freeing */
+
+#define mem_free(PTR) mem_free_func((PTR), __FILE__, __LINE__)
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Frees a single buffer of storage from
+the dynamic memory of C compiler. Similar to free of C. */
+UNIV_INLINE
+void
+mem_free_func(
+/*==========*/
+ void* ptr, /*!< in, own: buffer to be freed */
+ const char* file_name, /*!< in: file name where created */
+ ulint line); /*!< in: line where created */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+ const char* str); /*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+ const char* str, /*!< in: string to be copied */
+ ulint len); /*!< in: length of str, in bytes */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string, allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INTERN
+char*
+mem_heap_strdup(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str); /*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string,
+allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INLINE
+char*
+mem_heap_strdupl(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str, /*!< in: string to be copied */
+ ulint len); /*!< in: length of str, in bytes */
+
+/**********************************************************************//**
+Concatenate two strings and return the result, using a memory heap.
+@return own: the result */
+UNIV_INTERN
+char*
+mem_heap_strcat(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* s1, /*!< in: string 1 */
+ const char* s2); /*!< in: string 2 */
+
+/**********************************************************************//**
+Duplicate a block of data, allocated from a memory heap.
+@return own: a copy of the data */
+UNIV_INTERN
+void*
+mem_heap_dup(
+/*=========*/
+ mem_heap_t* heap, /*!< in: memory heap where copy is allocated */
+ const void* data, /*!< in: data to be copied */
+ ulint len); /*!< in: length of data, in bytes */
+
+/****************************************************************//**
+A simple (s)printf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return heap-allocated formatted string */
+UNIV_INTERN
+char*
+mem_heap_printf(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* format, /*!< in: format string */
+ ...) __attribute__ ((format (printf, 2, 3)));
+
+#ifdef MEM_PERIODIC_CHECK
+/******************************************************************//**
+Goes through the list of all allocated mem blocks, checks their magic
+numbers, and reports possible corruption. */
+UNIV_INTERN
+void
+mem_validate_all_blocks(void);
+/*=========================*/
+#endif
+
+/*#######################################################################*/
+
+/* The info header of a block in a memory heap */
+
+struct mem_block_info_struct {
+ ulint magic_n;/* magic number for debugging */
+ char file_name[8];/* file name where the mem heap was created */
+ ulint line; /*!< line number where the mem heap was created */
+ UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in the
+ the list this is the base node of the list of blocks;
+ in subsequent blocks this is undefined */
+ UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next
+ and prev in the list. The first block allocated
+ to the heap is also the first block in this list,
+ though it also contains the base node of the list. */
+ ulint len; /*!< physical length of this block in bytes */
+ ulint type; /*!< type of heap: MEM_HEAP_DYNAMIC, or
+ MEM_HEAP_BUF possibly ORed to MEM_HEAP_BTR_SEARCH */
+ ulint free; /*!< offset in bytes of the first free position for
+ user data in the block */
+ ulint start; /*!< the value of the struct field 'free' at the
+ creation of the block */
+#ifndef UNIV_HOTBACKUP
+ void* free_block;
+ /* if the MEM_HEAP_BTR_SEARCH bit is set in type,
+ and this is the heap root, this can contain an
+ allocated buffer frame, which can be appended as a
+ free block to the heap, if we need more space;
+ otherwise, this is NULL */
+ void* buf_block;
+ /* if this block has been allocated from the buffer
+ pool, this contains the buf_block_t handle;
+ otherwise, this is NULL */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef MEM_PERIODIC_CHECK
+ UT_LIST_NODE_T(mem_block_t) mem_block_list;
+ /* List of all mem blocks allocated; protected
+ by the mem_comm_pool mutex */
+#endif
+};
+
+#define MEM_BLOCK_MAGIC_N 764741555
+#define MEM_FREED_BLOCK_MAGIC_N 547711122
+
+/* Header size for a memory heap block */
+#define MEM_BLOCK_HEADER_SIZE ut_calc_align(sizeof(mem_block_info_t),\
+ UNIV_MEM_ALIGNMENT)
+#include "mem0dbg.h"
+
+#ifndef UNIV_NONINL
+#include "mem0mem.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/mem0mem.ic b/storage/innobase/include/mem0mem.ic
new file mode 100644
index 00000000000..e7080d8c508
--- /dev/null
+++ b/storage/innobase/include/mem0mem.ic
@@ -0,0 +1,646 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0mem.ic
+The memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0dbg.ic"
+#ifndef UNIV_HOTBACKUP
+# include "mem0pool.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_create_block(
+/*==================*/
+ mem_heap_t* heap, /*!< in: memory heap or NULL if first block
+ should be created */
+ ulint n, /*!< in: number of bytes needed for user data */
+ ulint type, /*!< in: type of heap: MEM_HEAP_DYNAMIC or
+ MEM_HEAP_BUFFER */
+ const char* file_name,/*!< in: file name where created */
+ ulint line); /*!< in: line where created */
+/******************************************************************//**
+Frees a block from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_block_free(
+/*================*/
+ mem_heap_t* heap, /*!< in: heap */
+ mem_block_t* block); /*!< in: block to free */
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_free_block_free(
+/*=====================*/
+ mem_heap_t* heap); /*!< in: heap */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Adds a new block to a memory heap.
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: number of bytes user needs */
+
+UNIV_INLINE
+void
+mem_block_set_len(mem_block_t* block, ulint len)
+{
+ ut_ad(len > 0);
+
+ block->len = len;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_len(mem_block_t* block)
+{
+ return(block->len);
+}
+
+UNIV_INLINE
+void
+mem_block_set_type(mem_block_t* block, ulint type)
+{
+ ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+ || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+ block->type = type;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_type(mem_block_t* block)
+{
+ return(block->type);
+}
+
+UNIV_INLINE
+void
+mem_block_set_free(mem_block_t* block, ulint free)
+{
+ ut_ad(free > 0);
+ ut_ad(free <= mem_block_get_len(block));
+
+ block->free = free;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_free(mem_block_t* block)
+{
+ return(block->free);
+}
+
+UNIV_INLINE
+void
+mem_block_set_start(mem_block_t* block, ulint start)
+{
+ ut_ad(start > 0);
+
+ block->start = start;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_start(mem_block_t* block)
+{
+ return(block->start);
+}
+
+/***************************************************************//**
+Allocates and zero-fills n bytes of memory from a memory heap.
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+{
+ ut_ad(heap);
+ ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH));
+ return(memset(mem_heap_alloc(heap, n), 0, n));
+}
+
+/***************************************************************//**
+Allocates n bytes of memory from a memory heap.
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+/*===========*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+{
+ mem_block_t* block;
+ void* buf;
+ ulint free;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF));
+
+ /* Check if there is enough space in block. If not, create a new
+ block to the heap */
+
+ if (mem_block_get_len(block)
+ < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) {
+
+ block = mem_heap_add_block(heap, n);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+ }
+
+ free = mem_block_get_free(block);
+
+ buf = (byte*)block + free;
+
+ mem_block_set_free(block, free + MEM_SPACE_NEEDED(n));
+
+#ifdef UNIV_MEM_DEBUG
+ UNIV_MEM_ALLOC(buf,
+ n + MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE);
+
+ /* In the debug version write debugging info to the field */
+ mem_field_init((byte*)buf, n);
+
+ /* Advance buf to point at the storage which will be given to the
+ caller */
+ buf = (byte*)buf + MEM_FIELD_HEADER_SIZE;
+
+#endif
+#ifdef UNIV_SET_MEM_TO_ZERO
+ UNIV_MEM_ALLOC(buf, n);
+ memset(buf, '\0', n);
+#endif
+ UNIV_MEM_ALLOC(buf, n);
+ return(buf);
+}
+
+/*****************************************************************//**
+Returns a pointer to the heap top.
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+/*==================*/
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ mem_block_t* block;
+ byte* buf;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*)block + mem_block_get_free(block);
+
+ return(buf);
+}
+
+/*****************************************************************//**
+Frees the space in a memory heap exceeding the pointer given. The
+pointer must have been acquired from mem_heap_get_heap_top. The first
+memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+/*===================*/
+ mem_heap_t* heap, /*!< in: heap from which to free */
+ byte* old_top)/*!< in: pointer to old top of heap */
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+#ifdef UNIV_MEM_DEBUG
+ ibool error;
+ ulint total_size;
+ ulint size;
+#endif
+
+ ut_ad(mem_heap_check(heap));
+
+#ifdef UNIV_MEM_DEBUG
+
+ /* Validate the heap and get its total allocated size */
+ mem_heap_validate_or_print(heap, NULL, FALSE, &error, &total_size,
+ NULL, NULL);
+ ut_a(!error);
+
+ /* Get the size below top pointer */
+ mem_heap_validate_or_print(heap, old_top, FALSE, &error, &size, NULL,
+ NULL);
+ ut_a(!error);
+
+#endif
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ while (block != NULL) {
+ if (((byte*)block + mem_block_get_free(block) >= old_top)
+ && ((byte*)block <= old_top)) {
+ /* Found the right block */
+
+ break;
+ }
+
+ /* Store prev_block value before freeing the current block
+ (the current block will be erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+
+ ut_ad(block);
+
+ /* Set the free field of block */
+ mem_block_set_free(block, old_top - (byte*)block);
+
+#ifdef UNIV_MEM_DEBUG
+ ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+
+ /* In the debug version erase block from top up */
+ mem_erase_buf(old_top, (byte*)block + block->len - old_top);
+
+ /* Update allocated memory count */
+ mutex_enter(&mem_hash_mutex);
+ mem_current_allocated_memory -= (total_size - size);
+ mutex_exit(&mem_hash_mutex);
+#else /* UNIV_MEM_DEBUG */
+ UNIV_MEM_ASSERT_W(old_top, (byte*)block + block->len - old_top);
+#endif /* UNIV_MEM_DEBUG */
+ UNIV_MEM_ALLOC(old_top, (byte*)block + block->len - old_top);
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block)
+ == mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ }
+}
+
+/*****************************************************************//**
+Empties a memory heap. The first memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_empty(
+/*===========*/
+ mem_heap_t* heap) /*!< in: heap to empty */
+{
+ mem_heap_free_heap_top(heap, (byte*)heap + mem_block_get_start(heap));
+#ifndef UNIV_HOTBACKUP
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/*****************************************************************//**
+Returns a pointer to the topmost element in a memory heap. The size of the
+element must be given.
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: size of the topmost element */
+{
+ mem_block_t* block;
+ void* buf;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*)block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n);
+
+#ifdef UNIV_MEM_DEBUG
+ ut_ad(mem_block_get_start(block) <=(ulint)((byte*)buf - (byte*)block));
+
+ /* In the debug version, advance buf to point at the storage which
+ was given to the caller in the allocation*/
+
+ buf = (byte*)buf + MEM_FIELD_HEADER_SIZE;
+
+ /* Check that the field lengths agree */
+ ut_ad(n == (ulint)mem_field_header_get_len(buf));
+#endif
+
+ return(buf);
+}
+
+/*****************************************************************//**
+Frees the topmost element in a memory heap. The size of the element must be
+given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: size of the topmost element */
+{
+ mem_block_t* block;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ /* Subtract the free field of block */
+ mem_block_set_free(block, mem_block_get_free(block)
+ - MEM_SPACE_NEEDED(n));
+ UNIV_MEM_ASSERT_W((byte*) block + mem_block_get_free(block), n);
+#ifdef UNIV_MEM_DEBUG
+
+ ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+
+ /* In the debug version check the consistency, and erase field */
+ mem_field_erase((byte*)block + mem_block_get_free(block), n);
+#endif
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block)
+ == mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ } else {
+ /* Avoid a bogus UNIV_MEM_ASSERT_W() warning in a
+ subsequent invocation of mem_heap_free_top().
+ Originally, this was UNIV_MEM_FREE(), to catch writes
+ to freed memory. */
+ UNIV_MEM_ALLOC((byte*) block + mem_block_get_free(block), n);
+ }
+}
+
+/*****************************************************************//**
+NOTE: Use the corresponding macros instead of this function. Creates a
+memory heap. For debugging purposes, takes also the file name and line as
+argument.
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+/*=================*/
+ ulint n, /*!< in: desired start block size,
+ this means that a single user buffer
+ of size n will fit in the block,
+ 0 creates a default size block */
+ ulint type, /*!< in: heap type */
+ const char* file_name, /*!< in: file name where created */
+ ulint line) /*!< in: line where created */
+{
+ mem_block_t* block;
+
+ if (!n) {
+ n = MEM_BLOCK_START_SIZE;
+ }
+
+ block = mem_heap_create_block(NULL, n, type, file_name, line);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+
+ UT_LIST_INIT(block->base);
+
+ /* Add the created block itself as the first block in the list */
+ UT_LIST_ADD_FIRST(list, block->base, block);
+
+#ifdef UNIV_MEM_DEBUG
+
+ mem_hash_insert(block, file_name, line);
+
+#endif
+
+ return(block);
+}
+
+/*****************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees the space
+occupied by a memory heap. In the debug version erases the heap memory
+blocks. */
+UNIV_INLINE
+void
+mem_heap_free_func(
+/*===============*/
+ mem_heap_t* heap, /*!< in, own: heap to be freed */
+ const char* file_name __attribute__((unused)),
+ /*!< in: file name where freed */
+ ulint line __attribute__((unused)))
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+#ifdef UNIV_MEM_DEBUG
+
+ /* In the debug version remove the heap from the hash table of heaps
+ and check its consistency */
+
+ mem_hash_remove(heap, file_name, line);
+
+#endif
+#ifndef UNIV_HOTBACKUP
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ while (block != NULL) {
+ /* Store the contents of info before freeing current block
+ (it is erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+}
+
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. Is like malloc of C. The buffer must be freed
+with mem_free.
+@return own: free storage */
+UNIV_INLINE
+void*
+mem_alloc_func(
+/*===========*/
+ ulint n, /*!< in: desired number of bytes */
+ ulint* size, /*!< out: allocated size in bytes,
+ or NULL */
+ const char* file_name, /*!< in: file name where created */
+ ulint line) /*!< in: line where created */
+{
+ mem_heap_t* heap;
+ void* buf;
+
+ heap = mem_heap_create_func(n, MEM_HEAP_DYNAMIC, file_name, line);
+
+ /* Note that as we created the first block in the heap big enough
+ for the buffer requested by the caller, the buffer will be in the
+ first block and thus we can calculate the pointer to the heap from
+ the pointer to the buffer when we free the memory buffer. */
+
+ if (UNIV_LIKELY_NULL(size)) {
+ /* Adjust the allocation to the actual size of the
+ memory block. */
+ ulint m = mem_block_get_len(heap)
+ - mem_block_get_free(heap);
+#ifdef UNIV_MEM_DEBUG
+ m -= MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE;
+#endif /* UNIV_MEM_DEBUG */
+ ut_ad(m >= n);
+ *size = n = m;
+ }
+
+ buf = mem_heap_alloc(heap, n);
+
+ ut_a((byte*)heap == (byte*)buf - MEM_BLOCK_HEADER_SIZE
+ - MEM_FIELD_HEADER_SIZE);
+ return(buf);
+}
+
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees a single
+buffer of storage from the dynamic memory of the C compiler. Similar to the
+free of C. */
+UNIV_INLINE
+void
+mem_free_func(
+/*==========*/
+ void* ptr, /*!< in, own: buffer to be freed */
+ const char* file_name, /*!< in: file name where created */
+ ulint line) /*!< in: line where created */
+{
+ mem_heap_t* heap;
+
+ heap = (mem_heap_t*)((byte*)ptr - MEM_BLOCK_HEADER_SIZE
+ - MEM_FIELD_HEADER_SIZE);
+ mem_heap_free_func(heap, file_name, line);
+}
+
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap) /*!< in: heap */
+{
+ mem_block_t* block;
+ ulint size = 0;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = heap;
+
+ while (block != NULL) {
+
+ size += mem_block_get_len(block);
+ block = UT_LIST_GET_NEXT(list, block);
+ }
+#ifndef UNIV_HOTBACKUP
+ if (heap->free_block) {
+ size += UNIV_PAGE_SIZE;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ return(size);
+}
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+ const char* str) /*!< in: string to be copied */
+{
+ ulint len = strlen(str) + 1;
+ return((char*) memcpy(mem_alloc(len), str, len));
+}
+
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+ const char* str, /*!< in: string to be copied */
+ ulint len) /*!< in: length of str, in bytes */
+{
+ char* s = (char*) mem_alloc(len + 1);
+ s[len] = 0;
+ return((char*) memcpy(s, str, len));
+}
+
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string,
+allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INLINE
+char*
+mem_heap_strdupl(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str, /*!< in: string to be copied */
+ ulint len) /*!< in: length of str, in bytes */
+{
+ char* s = (char*) mem_heap_alloc(heap, len + 1);
+ s[len] = 0;
+ return((char*) memcpy(s, str, len));
+}
diff --git a/storage/innobase/include/mem0pool.h b/storage/innobase/include/mem0pool.h
new file mode 100644
index 00000000000..18f988241d6
--- /dev/null
+++ b/storage/innobase/include/mem0pool.h
@@ -0,0 +1,129 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0pool.h
+The lowest-level memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0pool_h
+#define mem0pool_h
+
+#include "univ.i"
+#include "os0file.h"
+#include "ut0lst.h"
+
+/** Memory area header */
+typedef struct mem_area_struct mem_area_t;
+/** Memory pool */
+typedef struct mem_pool_struct mem_pool_t;
+
+/** The common memory pool */
+extern mem_pool_t* mem_comm_pool;
+
+/** Memory area header */
+
+struct mem_area_struct{
+ ulint size_and_free; /*!< memory area size is obtained by
+ anding with ~MEM_AREA_FREE; area in
+ a free list if ANDing with
+ MEM_AREA_FREE results in nonzero */
+ UT_LIST_NODE_T(mem_area_t)
+ free_list; /*!< free list node */
+};
+
+/** Each memory area takes this many extra bytes for control information */
+#define MEM_AREA_EXTRA_SIZE (ut_calc_align(sizeof(struct mem_area_struct),\
+ UNIV_MEM_ALIGNMENT))
+
+/********************************************************************//**
+Creates a memory pool.
+@return memory pool */
+UNIV_INTERN
+mem_pool_t*
+mem_pool_create(
+/*============*/
+ ulint size); /*!< in: pool size in bytes */
+/********************************************************************//**
+Allocates memory from a pool. NOTE: This low-level function should only be
+used in mem0mem.*!
+@return own: allocated memory buffer */
+UNIV_INTERN
+void*
+mem_area_alloc(
+/*===========*/
+ ulint* psize, /*!< in: requested size in bytes; for optimum
+ space usage, the size should be a power of 2
+ minus MEM_AREA_EXTRA_SIZE;
+ out: allocated size in bytes (greater than
+ or equal to the requested size) */
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Frees memory to a pool. */
+UNIV_INTERN
+void
+mem_area_free(
+/*==========*/
+ void* ptr, /*!< in, own: pointer to allocated memory
+ buffer */
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Returns the amount of reserved memory.
+@return reserved mmeory in bytes */
+UNIV_INTERN
+ulint
+mem_pool_get_reserved(
+/*==================*/
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Reserves the mem pool mutex. */
+UNIV_INTERN
+void
+mem_pool_mutex_enter(void);
+/*======================*/
+/********************************************************************//**
+Releases the mem pool mutex. */
+UNIV_INTERN
+void
+mem_pool_mutex_exit(void);
+/*=====================*/
+/********************************************************************//**
+Validates a memory pool.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_pool_validate(
+/*==============*/
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Prints info of a memory pool. */
+UNIV_INTERN
+void
+mem_pool_print_info(
+/*================*/
+ FILE* outfile,/*!< in: output file to write to */
+ mem_pool_t* pool); /*!< in: memory pool */
+
+
+#ifndef UNIV_NONINL
+#include "mem0pool.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/mem0pool.ic b/storage/innobase/include/mem0pool.ic
new file mode 100644
index 00000000000..b891dd6dea0
--- /dev/null
+++ b/storage/innobase/include/mem0pool.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0pool.ic
+The lowest-level memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
new file mode 100644
index 00000000000..6322af2a569
--- /dev/null
+++ b/storage/innobase/include/mtr0log.h
@@ -0,0 +1,250 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0log.h
+Mini-transaction logging routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0log_h
+#define mtr0log_h
+
+#include "univ.i"
+#include "mtr0mtr.h"
+#include "dict0types.h"
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Writes 1 - 4 bytes to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_ulint(
+/*=============*/
+ byte* ptr, /*!< in: pointer where to write */
+ ulint val, /*!< in: value to write */
+ byte type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes 8 bytes to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_dulint(
+/*==============*/
+ byte* ptr, /*!< in: pointer where to write */
+ dulint val, /*!< in: value to write */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes a string to a file page buffered in the buffer pool. Writes the
+corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_string(
+/*==============*/
+ byte* ptr, /*!< in: pointer where to write */
+ const byte* str, /*!< in: string to write */
+ ulint len, /*!< in: string length */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Logs a write of a string to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_log_string(
+/*============*/
+ byte* ptr, /*!< in: pointer written to */
+ ulint len, /*!< in: string length */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes initial part of a log record consisting of one-byte item
+type and four-byte space and page numbers. */
+UNIV_INTERN
+void
+mlog_write_initial_log_record(
+/*==========================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes a log record about an .ibd file create/delete/rename.
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_for_file_op(
+/*======================================*/
+ ulint type, /*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id,/*!< in: space id, if applicable */
+ ulint page_no,/*!< in: page number (not relevant currently) */
+ byte* log_ptr,/*!< in: pointer to mtr log which has been opened */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************//**
+Catenates 1 - 4 bytes to the mtr log. */
+UNIV_INLINE
+void
+mlog_catenate_ulint(
+/*================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val, /*!< in: value to write */
+ ulint type); /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+/********************************************************//**
+Catenates n bytes to the mtr log. */
+UNIV_INTERN
+void
+mlog_catenate_string(
+/*=================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* str, /*!< in: string to write */
+ ulint len); /*!< in: string length */
+/********************************************************//**
+Catenates a compressed ulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ulint_compressed(
+/*===========================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val); /*!< in: value to write */
+/********************************************************//**
+Catenates a compressed dulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_dulint_compressed(
+/*============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ dulint val); /*!< in: value to write */
+/********************************************************//**
+Opens a buffer to mlog. It must be closed with mlog_close.
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INLINE
+byte*
+mlog_open(
+/*======*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint size); /*!< in: buffer size in bytes; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+/********************************************************//**
+Closes a buffer opened to mlog. */
+UNIV_INLINE
+void
+mlog_close(
+/*=======*/
+ mtr_t* mtr, /*!< in: mtr */
+ byte* ptr); /*!< in: buffer space from ptr up was not used */
+/********************************************************//**
+Writes the initial part of a log record (3..11 bytes).
+If the implementation of this function is changed, all
+size parameters to mlog_open() should be adjusted accordingly!
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_fast(
+/*===============================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ byte* log_ptr,/*!< in: pointer to mtr log which has
+ been opened */
+ mtr_t* mtr); /*!< in: mtr */
+#else /* !UNIV_HOTBACKUP */
+# define mlog_write_initial_log_record(ptr,type,mtr) ((void) 0)
+# define mlog_write_initial_log_record_fast(ptr,type,log_ptr,mtr) ((byte *) 0)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************//**
+Parses an initial log record written by mlog_write_initial_log_record.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_initial_log_record(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* type, /*!< out: log record type: MLOG_1BYTE, ... */
+ ulint* space, /*!< out: space id */
+ ulint* page_no);/*!< out: page number */
+/********************************************************//**
+Parses a log record written by mlog_write_ulint or mlog_write_dulint.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_nbytes(
+/*==============*/
+ ulint type, /*!< in: log record type: MLOG_1BYTE, ... */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* page, /*!< in: page where to apply the log record, or NULL */
+ void* page_zip);/*!< in/out: compressed page, or NULL */
+/********************************************************//**
+Parses a log record written by mlog_write_string.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_string(
+/*==============*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* page, /*!< in: page where to apply the log record, or NULL */
+ void* page_zip);/*!< in/out: compressed page, or NULL */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Opens a buffer for mlog, writes the initial log record and,
+if needed, the field lengths of an index. Reserves space
+for further log entries. The log entry must be closed with
+mtr_close().
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INTERN
+byte*
+mlog_open_and_write_index(
+/*======================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* rec, /*!< in: index record or page */
+ dict_index_t* index, /*!< in: record descriptor */
+ byte type, /*!< in: log item type */
+ ulint size); /*!< in: requested buffer size in bytes
+ (if 0, calls mlog_close() and returns NULL) */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses a log record written by mlog_open_and_write_index.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_index(
+/*=============*/
+ byte* ptr, /*!< in: buffer */
+ const byte* end_ptr,/*!< in: buffer end */
+ ibool comp, /*!< in: TRUE=compact record format */
+ dict_index_t** index); /*!< out, own: dummy index */
+
+#ifndef UNIV_HOTBACKUP
+/* Insert, update, and maybe other functions may use this value to define an
+extra mlog buffer size for variable size data */
+#define MLOG_BUF_MARGIN 256
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "mtr0log.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/mtr0log.ic b/storage/innobase/include/mtr0log.ic
new file mode 100644
index 00000000000..5c24c38b337
--- /dev/null
+++ b/storage/innobase/include/mtr0log.ic
@@ -0,0 +1,274 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0log.ic
+Mini-transaction logging routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "ut0lst.h"
+#include "buf0buf.h"
+#include "fsp0types.h"
+#include "trx0sys.h"
+
+/********************************************************//**
+Opens a buffer to mlog. It must be closed with mlog_close.
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INLINE
+byte*
+mlog_open(
+/*======*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint size) /*!< in: buffer size in bytes; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+{
+ dyn_array_t* mlog;
+
+ mtr->modifications = TRUE;
+
+ if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+
+ return(NULL);
+ }
+
+ mlog = &(mtr->log);
+
+ return(dyn_array_open(mlog, size));
+}
+
+/********************************************************//**
+Closes a buffer opened to mlog. */
+UNIV_INLINE
+void
+mlog_close(
+/*=======*/
+ mtr_t* mtr, /*!< in: mtr */
+ byte* ptr) /*!< in: buffer space from ptr up was not used */
+{
+ dyn_array_t* mlog;
+
+ ut_ad(mtr_get_log_mode(mtr) != MTR_LOG_NONE);
+
+ mlog = &(mtr->log);
+
+ dyn_array_close(mlog, ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Catenates 1 - 4 bytes to the mtr log. The value is not compressed. */
+UNIV_INLINE
+void
+mlog_catenate_ulint(
+/*================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val, /*!< in: value to write */
+ ulint type) /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+{
+ dyn_array_t* mlog;
+ byte* ptr;
+
+ if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+
+ return;
+ }
+
+ mlog = &(mtr->log);
+
+#if MLOG_1BYTE != 1
+# error "MLOG_1BYTE != 1"
+#endif
+#if MLOG_2BYTES != 2
+# error "MLOG_2BYTES != 2"
+#endif
+#if MLOG_4BYTES != 4
+# error "MLOG_4BYTES != 4"
+#endif
+#if MLOG_8BYTES != 8
+# error "MLOG_8BYTES != 8"
+#endif
+ ptr = (byte*) dyn_array_push(mlog, type);
+
+ if (type == MLOG_4BYTES) {
+ mach_write_to_4(ptr, val);
+ } else if (type == MLOG_2BYTES) {
+ mach_write_to_2(ptr, val);
+ } else {
+ ut_ad(type == MLOG_1BYTE);
+ mach_write_to_1(ptr, val);
+ }
+}
+
+/********************************************************//**
+Catenates a compressed ulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ulint_compressed(
+/*===========================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val) /*!< in: value to write */
+{
+ byte* log_ptr;
+
+ log_ptr = mlog_open(mtr, 10);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr += mach_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+}
+
+/********************************************************//**
+Catenates a compressed dulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_dulint_compressed(
+/*============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ dulint val) /*!< in: value to write */
+{
+ byte* log_ptr;
+
+ log_ptr = mlog_open(mtr, 15);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr += mach_dulint_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+}
+
+/********************************************************//**
+Writes the initial part of a log record (3..11 bytes).
+If the implementation of this function is changed, all
+size parameters to mlog_open() should be adjusted accordingly!
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_fast(
+/*===============================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ byte* log_ptr,/*!< in: pointer to mtr log which has
+ been opened */
+ mtr_t* mtr) /*!< in: mtr */
+{
+#ifdef UNIV_DEBUG
+ buf_block_t* block;
+#endif
+ const byte* page;
+ ulint space;
+ ulint offset;
+
+ ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(type <= MLOG_BIGGEST_TYPE);
+ ut_ad(ptr && log_ptr);
+
+ page = (const byte*) ut_align_down(ptr, UNIV_PAGE_SIZE);
+ space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ offset = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+ /* check whether the page is in the doublewrite buffer;
+ the doublewrite buffer is located in pages
+ FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the
+ system tablespace */
+ if (space == TRX_SYS_SPACE
+ && offset >= FSP_EXTENT_SIZE && offset < 3 * FSP_EXTENT_SIZE) {
+ if (trx_doublewrite_buf_is_being_created) {
+ /* Do nothing: we only come to this branch in an
+ InnoDB database creation. We do not redo log
+ anything for the doublewrite buffer pages. */
+ return(log_ptr);
+ } else {
+ fprintf(stderr,
+ "Error: trying to redo log a record of type "
+ "%d on page %lu of space %lu in the "
+ "doublewrite buffer, continuing anyway.\n"
+ "Please post a bug report to "
+ "bugs.mysql.com.\n",
+ type, offset, space);
+ }
+ }
+
+ mach_write_to_1(log_ptr, type);
+ log_ptr++;
+ log_ptr += mach_write_compressed(log_ptr, space);
+ log_ptr += mach_write_compressed(log_ptr, offset);
+
+ mtr->n_log_recs++;
+
+#ifdef UNIV_LOG_DEBUG
+ fprintf(stderr,
+ "Adding to mtr log record type %lu space %lu page no %lu\n",
+ (ulong) type, space, offset);
+#endif
+
+#ifdef UNIV_DEBUG
+ /* We now assume that all x-latched pages have been modified! */
+ block = (buf_block_t*) buf_block_align(ptr);
+
+ if (!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY)) {
+
+ mtr_memo_push(mtr, block, MTR_MEMO_MODIFY);
+ }
+#endif
+ return(log_ptr);
+}
+
+/********************************************************//**
+Writes a log record about an .ibd file create/delete/rename.
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_for_file_op(
+/*======================================*/
+ ulint type, /*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id,/*!< in: space id, if applicable */
+ ulint page_no,/*!< in: page number (not relevant currently) */
+ byte* log_ptr,/*!< in: pointer to mtr log which has been opened */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(log_ptr);
+
+ mach_write_to_1(log_ptr, type);
+ log_ptr++;
+
+ /* We write dummy space id and page number */
+ log_ptr += mach_write_compressed(log_ptr, space_id);
+ log_ptr += mach_write_compressed(log_ptr, page_no);
+
+ mtr->n_log_recs++;
+
+ return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
new file mode 100644
index 00000000000..69a2c03f4cb
--- /dev/null
+++ b/storage/innobase/include/mtr0mtr.h
@@ -0,0 +1,416 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.h
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0mtr_h
+#define mtr0mtr_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#include "dyn0dyn.h"
+#include "buf0types.h"
+#include "sync0rw.h"
+#include "ut0byte.h"
+#include "mtr0types.h"
+#include "page0types.h"
+
+/* Logging modes for a mini-transaction */
+#define MTR_LOG_ALL 21 /* default mode: log all operations
+ modifying disk-based data */
+#define MTR_LOG_NONE 22 /* log no operations */
+/*#define MTR_LOG_SPACE 23 */ /* log only operations modifying
+ file space page allocation data
+ (operations in fsp0fsp.* ) */
+#define MTR_LOG_SHORT_INSERTS 24 /* inserts are logged in a shorter
+ form */
+
+/* Types for the mlock objects to store in the mtr memo; NOTE that the
+first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+#define MTR_MEMO_PAGE_S_FIX RW_S_LATCH
+#define MTR_MEMO_PAGE_X_FIX RW_X_LATCH
+#define MTR_MEMO_BUF_FIX RW_NO_LATCH
+#define MTR_MEMO_MODIFY 54
+#define MTR_MEMO_S_LOCK 55
+#define MTR_MEMO_X_LOCK 56
+
+/** @name Log item types
+The log items are declared 'byte' so that the compiler can warn if val
+and type parameters are switched in a call to mlog_write_ulint. NOTE!
+For 1 - 8 bytes, the flag value must give the length also! @{ */
+#define MLOG_SINGLE_REC_FLAG 128 /*!< if the mtr contains only
+ one log record for one page,
+ i.e., write_initial_log_record
+ has been called only once,
+ this flag is ORed to the type
+ of that first log record */
+#define MLOG_1BYTE (1) /*!< one byte is written */
+#define MLOG_2BYTES (2) /*!< 2 bytes ... */
+#define MLOG_4BYTES (4) /*!< 4 bytes ... */
+#define MLOG_8BYTES (8) /*!< 8 bytes ... */
+#define MLOG_REC_INSERT ((byte)9) /*!< record insert */
+#define MLOG_REC_CLUST_DELETE_MARK ((byte)10) /*!< mark clustered index record
+ deleted */
+#define MLOG_REC_SEC_DELETE_MARK ((byte)11) /*!< mark secondary index record
+ deleted */
+#define MLOG_REC_UPDATE_IN_PLACE ((byte)13) /*!< update of a record,
+ preserves record field sizes */
+#define MLOG_REC_DELETE ((byte)14) /*!< delete a record from a
+ page */
+#define MLOG_LIST_END_DELETE ((byte)15) /*!< delete record list end on
+ index page */
+#define MLOG_LIST_START_DELETE ((byte)16) /*!< delete record list start on
+ index page */
+#define MLOG_LIST_END_COPY_CREATED ((byte)17) /*!< copy record list end to a
+ new created index page */
+#define MLOG_PAGE_REORGANIZE ((byte)18) /*!< reorganize an
+ index page in
+ ROW_FORMAT=REDUNDANT */
+#define MLOG_PAGE_CREATE ((byte)19) /*!< create an index page */
+#define MLOG_UNDO_INSERT ((byte)20) /*!< insert entry in an undo
+ log */
+#define MLOG_UNDO_ERASE_END ((byte)21) /*!< erase an undo log
+ page end */
+#define MLOG_UNDO_INIT ((byte)22) /*!< initialize a page in an
+ undo log */
+#define MLOG_UNDO_HDR_DISCARD ((byte)23) /*!< discard an update undo log
+ header */
+#define MLOG_UNDO_HDR_REUSE ((byte)24) /*!< reuse an insert undo log
+ header */
+#define MLOG_UNDO_HDR_CREATE ((byte)25) /*!< create an undo
+ log header */
+#define MLOG_REC_MIN_MARK ((byte)26) /*!< mark an index
+ record as the
+ predefined minimum
+ record */
+#define MLOG_IBUF_BITMAP_INIT ((byte)27) /*!< initialize an
+ ibuf bitmap page */
+/*#define MLOG_FULL_PAGE ((byte)28) full contents of a page */
+#define MLOG_INIT_FILE_PAGE ((byte)29) /*!< this means that a
+ file page is taken
+ into use and the prior
+ contents of the page
+ should be ignored: in
+ recovery we must not
+ trust the lsn values
+ stored to the file
+ page */
+#define MLOG_WRITE_STRING ((byte)30) /*!< write a string to
+ a page */
+#define MLOG_MULTI_REC_END ((byte)31) /*!< if a single mtr writes
+ log records for several pages,
+ this log record ends the
+ sequence of these records */
+#define MLOG_DUMMY_RECORD ((byte)32) /*!< dummy log record used to
+ pad a log block full */
+#define MLOG_FILE_CREATE ((byte)33) /*!< log record about an .ibd
+ file creation */
+#define MLOG_FILE_RENAME ((byte)34) /*!< log record about an .ibd
+ file rename */
+#define MLOG_FILE_DELETE ((byte)35) /*!< log record about an .ibd
+ file deletion */
+#define MLOG_COMP_REC_MIN_MARK ((byte)36) /*!< mark a compact
+ index record as the
+ predefined minimum
+ record */
+#define MLOG_COMP_PAGE_CREATE ((byte)37) /*!< create a compact
+ index page */
+#define MLOG_COMP_REC_INSERT ((byte)38) /*!< compact record insert */
+#define MLOG_COMP_REC_CLUST_DELETE_MARK ((byte)39)
+ /*!< mark compact
+ clustered index record
+ deleted */
+#define MLOG_COMP_REC_SEC_DELETE_MARK ((byte)40)/*!< mark compact
+ secondary index record
+ deleted; this log
+ record type is
+ redundant, as
+ MLOG_REC_SEC_DELETE_MARK
+ is independent of the
+ record format. */
+#define MLOG_COMP_REC_UPDATE_IN_PLACE ((byte)41)/*!< update of a
+ compact record,
+ preserves record field
+ sizes */
+#define MLOG_COMP_REC_DELETE ((byte)42) /*!< delete a compact record
+ from a page */
+#define MLOG_COMP_LIST_END_DELETE ((byte)43) /*!< delete compact record list
+ end on index page */
+#define MLOG_COMP_LIST_START_DELETE ((byte)44) /*!< delete compact record list
+ start on index page */
+#define MLOG_COMP_LIST_END_COPY_CREATED ((byte)45)
+ /*!< copy compact
+ record list end to a
+ new created index
+ page */
+#define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /*!< reorganize an index page */
+#define MLOG_FILE_CREATE2 ((byte)47) /*!< log record about creating
+ an .ibd file, with format */
+#define MLOG_ZIP_WRITE_NODE_PTR ((byte)48) /*!< write the node pointer of
+ a record on a compressed
+ non-leaf B-tree page */
+#define MLOG_ZIP_WRITE_BLOB_PTR ((byte)49) /*!< write the BLOB pointer
+ of an externally stored column
+ on a compressed page */
+#define MLOG_ZIP_WRITE_HEADER ((byte)50) /*!< write to compressed page
+ header */
+#define MLOG_ZIP_PAGE_COMPRESS ((byte)51) /*!< compress an index page */
+#define MLOG_BIGGEST_TYPE ((byte)51) /*!< biggest value (used in
+ assertions) */
+/* @} */
+
+/** @name Flags for MLOG_FILE operations
+(stored in the page number parameter, called log_flags in the
+functions). The page number parameter was originally written as 0. @{ */
+#define MLOG_FILE_FLAG_TEMP 1 /*!< identifies TEMPORARY TABLE in
+ MLOG_FILE_CREATE, MLOG_FILE_CREATE2 */
+/* @} */
+
+/***************************************************************//**
+Starts a mini-transaction and creates a mini-transaction handle
+and buffer in the memory buffer given by the caller.
+@return mtr buffer which also acts as the mtr handle */
+UNIV_INLINE
+mtr_t*
+mtr_start(
+/*======*/
+ mtr_t* mtr); /*!< in: memory buffer for the mtr buffer */
+/***************************************************************//**
+Commits a mini-transaction. */
+UNIV_INTERN
+void
+mtr_commit(
+/*=======*/
+ mtr_t* mtr); /*!< in: mini-transaction */
+/**********************************************************//**
+Sets and returns a savepoint in mtr.
+@return savepoint */
+UNIV_INLINE
+ulint
+mtr_set_savepoint(
+/*==============*/
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************//**
+Releases the latches stored in an mtr memo down to a savepoint.
+NOTE! The mtr must not have made changes to buffer pages after the
+savepoint, as these can be handled only by mtr_commit. */
+UNIV_INTERN
+void
+mtr_rollback_to_savepoint(
+/*======================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint); /*!< in: savepoint */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Releases the (index tree) s-latch stored in an mtr memo after a
+savepoint. */
+UNIV_INLINE
+void
+mtr_release_s_latch_at_savepoint(
+/*=============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint, /*!< in: savepoint */
+ rw_lock_t* lock); /*!< in: latch to release */
+#else /* !UNIV_HOTBACKUP */
+# define mtr_release_s_latch_at_savepoint(mtr,savepoint,lock) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Gets the logging mode of a mini-transaction.
+@return logging mode: MTR_LOG_NONE, ... */
+UNIV_INLINE
+ulint
+mtr_get_log_mode(
+/*=============*/
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Changes the logging mode of a mini-transaction.
+@return old mode */
+UNIV_INLINE
+ulint
+mtr_set_log_mode(
+/*=============*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint mode); /*!< in: logging mode: MTR_LOG_NONE, ... */
+/********************************************************//**
+Reads 1 - 4 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INTERN
+ulint
+mtr_read_ulint(
+/*===========*/
+ const byte* ptr, /*!< in: pointer from where to read */
+ ulint type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Reads 8 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INTERN
+dulint
+mtr_read_dulint(
+/*============*/
+ const byte* ptr, /*!< in: pointer from where to read */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+This macro locks an rw-lock in s-mode. */
+#define mtr_s_lock(B, MTR) mtr_s_lock_func((B), __FILE__, __LINE__,\
+ (MTR))
+/*********************************************************************//**
+This macro locks an rw-lock in x-mode. */
+#define mtr_x_lock(B, MTR) mtr_x_lock_func((B), __FILE__, __LINE__,\
+ (MTR))
+/*********************************************************************//**
+NOTE! Use the macro above!
+Locks a lock in s-mode. */
+UNIV_INLINE
+void
+mtr_s_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************************//**
+NOTE! Use the macro above!
+Locks a lock in x-mode. */
+UNIV_INLINE
+void
+mtr_x_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************//**
+Releases an object in the memo stack. */
+UNIV_INTERN
+void
+mtr_memo_release(
+/*=============*/
+ mtr_t* mtr, /*!< in: mtr */
+ void* object, /*!< in: object */
+ ulint type); /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Checks if memo contains the given item.
+@return TRUE if contains */
+UNIV_INLINE
+ibool
+mtr_memo_contains(
+/*==============*/
+ mtr_t* mtr, /*!< in: mtr */
+ const void* object, /*!< in: object to search */
+ ulint type); /*!< in: type of object */
+
+/**********************************************************//**
+Checks if memo contains the given page.
+@return TRUE if contains */
+UNIV_INTERN
+ibool
+mtr_memo_contains_page(
+/*===================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* ptr, /*!< in: pointer to buffer frame */
+ ulint type); /*!< in: type of object */
+/*********************************************************//**
+Prints info of an mtr handle. */
+UNIV_INTERN
+void
+mtr_print(
+/*======*/
+ mtr_t* mtr); /*!< in: mtr */
+# else /* !UNIV_HOTBACKUP */
+# define mtr_memo_contains(mtr, object, type) TRUE
+# define mtr_memo_contains_page(mtr, ptr, type) TRUE
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+/*######################################################################*/
+
+#define MTR_BUF_MEMO_SIZE 200 /* number of slots in memo */
+
+/***************************************************************//**
+Returns the log object of a mini-transaction buffer.
+@return log */
+UNIV_INLINE
+dyn_array_t*
+mtr_get_log(
+/*========*/
+ mtr_t* mtr); /*!< in: mini-transaction */
+/***************************************************//**
+Pushes an object to an mtr memo stack. */
+UNIV_INLINE
+void
+mtr_memo_push(
+/*==========*/
+ mtr_t* mtr, /*!< in: mtr */
+ void* object, /*!< in: object */
+ ulint type); /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+
+
+/* Type definition of a mini-transaction memo stack slot. */
+typedef struct mtr_memo_slot_struct mtr_memo_slot_t;
+struct mtr_memo_slot_struct{
+ ulint type; /*!< type of the stored object (MTR_MEMO_S_LOCK, ...) */
+ void* object; /*!< pointer to the object */
+};
+
+/* Mini-transaction handle and buffer */
+struct mtr_struct{
+#ifdef UNIV_DEBUG
+ ulint state; /*!< MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */
+#endif
+ dyn_array_t memo; /*!< memo stack for locks etc. */
+ dyn_array_t log; /*!< mini-transaction log */
+ ibool modifications;
+ /* TRUE if the mtr made modifications to
+ buffer pool pages */
+ ulint n_log_recs;
+ /* count of how many page initial log records
+ have been written to the mtr log */
+ ulint log_mode; /* specifies which operations should be
+ logged; default value MTR_LOG_ALL */
+ ib_uint64_t start_lsn;/* start lsn of the possible log entry for
+ this mtr */
+ ib_uint64_t end_lsn;/* end lsn of the possible log entry for
+ this mtr */
+#ifdef UNIV_DEBUG
+ ulint magic_n;
+#endif /* UNIV_DEBUG */
+};
+
+#ifdef UNIV_DEBUG
+# define MTR_MAGIC_N 54551
+#endif /* UNIV_DEBUG */
+
+#define MTR_ACTIVE 12231
+#define MTR_COMMITTING 56456
+#define MTR_COMMITTED 34676
+
+#ifndef UNIV_NONINL
+#include "mtr0mtr.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic
new file mode 100644
index 00000000000..310c7c4117f
--- /dev/null
+++ b/storage/innobase/include/mtr0mtr.ic
@@ -0,0 +1,272 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.ic
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "mach0data.h"
+
+/***************************************************************//**
+Starts a mini-transaction and creates a mini-transaction handle
+and a buffer in the memory buffer given by the caller.
+@return mtr buffer which also acts as the mtr handle */
+UNIV_INLINE
+mtr_t*
+mtr_start(
+/*======*/
+ mtr_t* mtr) /*!< in: memory buffer for the mtr buffer */
+{
+ dyn_array_create(&(mtr->memo));
+ dyn_array_create(&(mtr->log));
+
+ mtr->log_mode = MTR_LOG_ALL;
+ mtr->modifications = FALSE;
+ mtr->n_log_recs = 0;
+
+ ut_d(mtr->state = MTR_ACTIVE);
+ ut_d(mtr->magic_n = MTR_MAGIC_N);
+
+ return(mtr);
+}
+
+/***************************************************//**
+Pushes an object to an mtr memo stack. */
+UNIV_INLINE
+void
+mtr_memo_push(
+/*==========*/
+ mtr_t* mtr, /*!< in: mtr */
+ void* object, /*!< in: object */
+ ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+{
+ dyn_array_t* memo;
+ mtr_memo_slot_t* slot;
+
+ ut_ad(object);
+ ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
+ ut_ad(type <= MTR_MEMO_X_LOCK);
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ memo = &(mtr->memo);
+
+ slot = (mtr_memo_slot_t*) dyn_array_push(memo, sizeof *slot);
+
+ slot->object = object;
+ slot->type = type;
+}
+
+/**********************************************************//**
+Sets and returns a savepoint in mtr.
+@return savepoint */
+UNIV_INLINE
+ulint
+mtr_set_savepoint(
+/*==============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dyn_array_t* memo;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ memo = &(mtr->memo);
+
+ return(dyn_array_get_data_size(memo));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Releases the (index tree) s-latch stored in an mtr memo after a
+savepoint. */
+UNIV_INLINE
+void
+mtr_release_s_latch_at_savepoint(
+/*=============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint, /*!< in: savepoint */
+ rw_lock_t* lock) /*!< in: latch to release */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ memo = &(mtr->memo);
+
+ ut_ad(dyn_array_get_data_size(memo) > savepoint);
+
+ slot = (mtr_memo_slot_t*) dyn_array_get_element(memo, savepoint);
+
+ ut_ad(slot->object == lock);
+ ut_ad(slot->type == MTR_MEMO_S_LOCK);
+
+ rw_lock_s_unlock(lock);
+
+ slot->object = NULL;
+}
+
+# ifdef UNIV_DEBUG
+/**********************************************************//**
+Checks if memo contains the given item.
+@return TRUE if contains */
+UNIV_INLINE
+ibool
+mtr_memo_contains(
+/*==============*/
+ mtr_t* mtr, /*!< in: mtr */
+ const void* object, /*!< in: object to search */
+ ulint type) /*!< in: type of object */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+ ulint offset;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ memo = &(mtr->memo);
+
+ offset = dyn_array_get_data_size(memo);
+
+ while (offset > 0) {
+ offset -= sizeof(mtr_memo_slot_t);
+
+ slot = dyn_array_get_element(memo, offset);
+
+ if ((object == slot->object) && (type == slot->type)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+# endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Returns the log object of a mini-transaction buffer.
+@return log */
+UNIV_INLINE
+dyn_array_t*
+mtr_get_log(
+/*========*/
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ return(&(mtr->log));
+}
+
+/***************************************************************//**
+Gets the logging mode of a mini-transaction.
+@return logging mode: MTR_LOG_NONE, ... */
+UNIV_INLINE
+ulint
+mtr_get_log_mode(
+/*=============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->log_mode >= MTR_LOG_ALL);
+ ut_ad(mtr->log_mode <= MTR_LOG_SHORT_INSERTS);
+
+ return(mtr->log_mode);
+}
+
+/***************************************************************//**
+Changes the logging mode of a mini-transaction.
+@return old mode */
+UNIV_INLINE
+ulint
+mtr_set_log_mode(
+/*=============*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint mode) /*!< in: logging mode: MTR_LOG_NONE, ... */
+{
+ ulint old_mode;
+
+ ut_ad(mtr);
+ ut_ad(mode >= MTR_LOG_ALL);
+ ut_ad(mode <= MTR_LOG_SHORT_INSERTS);
+
+ old_mode = mtr->log_mode;
+
+ if ((mode == MTR_LOG_SHORT_INSERTS) && (old_mode == MTR_LOG_NONE)) {
+ /* Do nothing */
+ } else {
+ mtr->log_mode = mode;
+ }
+
+ ut_ad(old_mode >= MTR_LOG_ALL);
+ ut_ad(old_mode <= MTR_LOG_SHORT_INSERTS);
+
+ return(old_mode);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Locks a lock in s-mode. */
+UNIV_INLINE
+void
+mtr_s_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(lock);
+
+ rw_lock_s_lock_func(lock, 0, file, line);
+
+ mtr_memo_push(mtr, lock, MTR_MEMO_S_LOCK);
+}
+
+/*********************************************************************//**
+Locks a lock in x-mode. */
+UNIV_INLINE
+void
+mtr_x_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(lock);
+
+ rw_lock_x_lock_func(lock, 0, file, line);
+
+ mtr_memo_push(mtr, lock, MTR_MEMO_X_LOCK);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
new file mode 100644
index 00000000000..83a7aaf3839
--- /dev/null
+++ b/storage/innobase/include/mtr0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0types.h
+Mini-transaction buffer global types
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0types_h
+#define mtr0types_h
+
+typedef struct mtr_struct mtr_t;
+
+#endif
diff --git a/storage/innobase/include/mysql_addons.h b/storage/innobase/include/mysql_addons.h
new file mode 100644
index 00000000000..17660c18710
--- /dev/null
+++ b/storage/innobase/include/mysql_addons.h
@@ -0,0 +1,33 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mysql_addons.h
+This file contains functions that need to be added to
+MySQL code but have not been added yet.
+
+Whenever you add a function here submit a MySQL bug
+report (feature request) with the implementation. Then
+write the bug number in the comment before the
+function in this file.
+
+When MySQL commits the function it can be deleted from
+here. In a perfect world this file exists but is empty.
+
+Created November 07, 2007 Vasil Dimov
+*******************************************************/
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
new file mode 100644
index 00000000000..d8d2f0e5d9e
--- /dev/null
+++ b/storage/innobase/include/os0file.h
@@ -0,0 +1,796 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+/***********************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/os0file.h
+The interface to the operating system file io
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "univ.i"
+
+#ifndef __WIN__
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#endif
+
+/** File node of a tablespace or the log data space */
+typedef struct fil_node_struct fil_node_t;
+
+#ifdef UNIV_DO_FLUSH
+extern ibool os_do_not_call_flush_at_each_write;
+#endif /* UNIV_DO_FLUSH */
+extern ibool os_has_said_disk_full;
+/** Flag: enable debug printout for asynchronous i/o */
+extern ibool os_aio_print_debug;
+
+/** Number of pending os_file_pread() operations */
+extern ulint os_file_n_pending_preads;
+/** Number of pending os_file_pwrite() operations */
+extern ulint os_file_n_pending_pwrites;
+
+/** Number of pending read operations */
+extern ulint os_n_pending_reads;
+/** Number of pending write operations */
+extern ulint os_n_pending_writes;
+
+#ifdef __WIN__
+
+/** We define always WIN_ASYNC_IO, and check at run-time whether
+ the OS actually supports it: Win 95 does not, NT does. */
+#define WIN_ASYNC_IO
+
+/** Use unbuffered I/O */
+#define UNIV_NON_BUFFERED_IO
+
+#endif
+
+#ifdef __WIN__
+/** File handle */
+#define os_file_t HANDLE
+/** Convert a C file descriptor to a native file handle
+@param fd file descriptor
+@return native file handle */
+#define OS_FILE_FROM_FD(fd) (HANDLE) _get_osfhandle(fd)
+#else
+/** File handle */
+typedef int os_file_t;
+/** Convert a C file descriptor to a native file handle
+@param fd file descriptor
+@return native file handle */
+#define OS_FILE_FROM_FD(fd) fd
+#endif
+
+/** Umask for creating files */
+extern ulint os_innodb_umask;
+
+/** If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio we build below with threads */
+
+extern ibool os_aio_use_native_aio;
+
+/** The next value should be smaller or equal to the smallest sector size used
+on any disk. A log block is required to be a portion of disk which is written
+so that if the start and the end of a block get written to disk, then the
+whole block gets written. This should be true even in most cases of a crash:
+if this fails for a log block, then it is equivalent to a media failure in the
+log. */
+
+#define OS_FILE_LOG_BLOCK_SIZE 512
+
+/** Options for file_create @{ */
+#define OS_FILE_OPEN 51
+#define OS_FILE_CREATE 52
+#define OS_FILE_OVERWRITE 53
+#define OS_FILE_OPEN_RAW 54
+#define OS_FILE_CREATE_PATH 55
+#define OS_FILE_OPEN_RETRY 56 /* for os_file_create() on
+ the first ibdata file */
+
+#define OS_FILE_READ_ONLY 333
+#define OS_FILE_READ_WRITE 444
+#define OS_FILE_READ_ALLOW_DELETE 555 /* for ibbackup */
+
+/* Options for file_create */
+#define OS_FILE_AIO 61
+#define OS_FILE_NORMAL 62
+/* @} */
+
+/** Types for file create @{ */
+#define OS_DATA_FILE 100
+#define OS_LOG_FILE 101
+/* @} */
+
+/** Error codes from os_file_get_last_error @{ */
+#define OS_FILE_NOT_FOUND 71
+#define OS_FILE_DISK_FULL 72
+#define OS_FILE_ALREADY_EXISTS 73
+#define OS_FILE_PATH_ERROR 74
+#define OS_FILE_AIO_RESOURCES_RESERVED 75 /* wait for OS aio resources
+ to become available again */
+#define OS_FILE_SHARING_VIOLATION 76
+#define OS_FILE_ERROR_NOT_SPECIFIED 77
+/* @} */
+
+/** Types for aio operations @{ */
+#define OS_FILE_READ 10
+#define OS_FILE_WRITE 11
+
+#define OS_FILE_LOG 256 /* This can be ORed to type */
+/* @} */
+
+#define OS_AIO_N_PENDING_IOS_PER_THREAD 32 /*!< Win NT does not allow more
+ than 64 */
+
+/** Modes for aio operations @{ */
+#define OS_AIO_NORMAL 21 /*!< Normal asynchronous i/o not for ibuf
+ pages or ibuf bitmap pages */
+#define OS_AIO_IBUF 22 /*!< Asynchronous i/o for ibuf pages or ibuf
+ bitmap pages */
+#define OS_AIO_LOG 23 /*!< Asynchronous i/o for the log */
+#define OS_AIO_SYNC 24 /*!< Asynchronous i/o where the calling thread
+ will itself wait for the i/o to complete,
+ doing also the job of the i/o-handler thread;
+ can be used for any pages, ibuf or non-ibuf.
+ This is used to save CPU time, as we can do
+ with fewer thread switches. Plain synchronous
+ i/o is not as good, because it must serialize
+ the file seek and read or write, causing a
+ bottleneck for parallelism. */
+
+#define OS_AIO_SIMULATED_WAKE_LATER 512 /*!< This can be ORed to mode
+ in the call of os_aio(...),
+ if the caller wants to post several i/o
+ requests in a batch, and only after that
+ wake the i/o-handler thread; this has
+ effect only in simulated aio */
+/* @} */
+
+#define OS_WIN31 1 /*!< Microsoft Windows 3.x */
+#define OS_WIN95 2 /*!< Microsoft Windows 95 */
+#define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */
+#define OS_WIN2000 4 /*!< Microsoft Windows 2000 */
+
+extern ulint os_n_file_reads;
+extern ulint os_n_file_writes;
+extern ulint os_n_fsyncs;
+
+/* File types for directory entry data type */
+
+enum os_file_type_enum{
+ OS_FILE_TYPE_UNKNOWN = 0,
+ OS_FILE_TYPE_FILE, /* regular file */
+ OS_FILE_TYPE_DIR, /* directory */
+ OS_FILE_TYPE_LINK /* symbolic link */
+};
+typedef enum os_file_type_enum os_file_type_t;
+
+/* Maximum path string length in bytes when referring to tables with in the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes */
+#define OS_FILE_MAX_PATH 4000
+
+/* Struct used in fetching information of a file in a directory */
+struct os_file_stat_struct{
+ char name[OS_FILE_MAX_PATH]; /*!< path to a file */
+ os_file_type_t type; /*!< file type */
+ ib_int64_t size; /*!< file size */
+ time_t ctime; /*!< creation time */
+ time_t mtime; /*!< modification time */
+ time_t atime; /*!< access time */
+};
+typedef struct os_file_stat_struct os_file_stat_t;
+
+#ifdef __WIN__
+typedef HANDLE os_file_dir_t; /*!< directory stream */
+#else
+typedef DIR* os_file_dir_t; /*!< directory stream */
+#endif
+
+/***********************************************************************//**
+Gets the operating system version. Currently works only on Windows.
+@return OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */
+UNIV_INTERN
+ulint
+os_get_os_version(void);
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Creates the seek mutexes used in positioned reads and writes. */
+UNIV_INTERN
+void
+os_io_init_simple(void);
+/*===================*/
+/***********************************************************************//**
+Creates a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the MySQL temporary directory.
+On Netware, this function is like tmpfile(3), because the C run-time
+library of Netware does not expose the delete-on-close flag.
+@return temporary file handle, or NULL on error */
+
+FILE*
+os_file_create_tmpfile(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+The os_file_opendir() function opens a directory stream corresponding to the
+directory named by the dirname argument. The directory stream is positioned
+at the first entry. In both Unix and Windows we automatically skip the '.'
+and '..' items at the start of the directory listing.
+@return directory stream, NULL if error */
+UNIV_INTERN
+os_file_dir_t
+os_file_opendir(
+/*============*/
+ const char* dirname, /*!< in: directory name; it must not
+ contain a trailing '\' or '/' */
+ ibool error_is_fatal);/*!< in: TRUE if we should treat an
+ error as a fatal error; if we try to
+ open symlinks then we do not wish a
+ fatal error if it happens not to be
+ a directory */
+/***********************************************************************//**
+Closes a directory stream.
+@return 0 if success, -1 if failure */
+UNIV_INTERN
+int
+os_file_closedir(
+/*=============*/
+ os_file_dir_t dir); /*!< in: directory stream */
+/***********************************************************************//**
+This function returns information of the next file in the directory. We jump
+over the '.' and '..' entries in the directory.
+@return 0 if ok, -1 if error, 1 if at the end of the directory */
+UNIV_INTERN
+int
+os_file_readdir_next_file(
+/*======================*/
+ const char* dirname,/*!< in: directory name or path */
+ os_file_dir_t dir, /*!< in: directory stream */
+ os_file_stat_t* info); /*!< in/out: buffer where the info is returned */
+/*****************************************************************//**
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists arguments is true.
+@return TRUE if call succeeds, FALSE on error */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+ const char* pathname, /*!< in: directory name as
+ null-terminated string */
+ ibool fail_if_exists);/*!< in: if TRUE, pre-existing directory
+ is treated as an error. */
+/****************************************************************//**
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple(
+/*==================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is
+ opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error), or
+ OS_FILE_CREATE_PATH if new file
+ (if exists, error) and subdirectories along
+ its path are created (if needed)*/
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success);/*!< out: TRUE if succeed, FALSE if error */
+/****************************************************************//**
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_no_error_handling(
+/*====================================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
+ is opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error) */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file */
+ ibool* success);/*!< out: TRUE if succeed, FALSE if error */
+/****************************************************************//**
+Tries to disable OS caching on an opened file descriptor. */
+UNIV_INTERN
+void
+os_file_set_nocache(
+/*================*/
+ int fd, /*!< in: file descriptor to alter */
+ const char* file_name, /*!< in: file name, used in the
+ diagnostic message */
+ const char* operation_name);/*!< in: "open" or "create"; used in the
+ diagnostic message */
+/****************************************************************//**
+Opens an existing file or creates a new.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create(
+/*===========*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
+ is opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error),
+ OS_FILE_OVERWRITE if a new file is created
+ or an old overwritten;
+ OS_FILE_OPEN_RAW, if a raw device or disk
+ partition should be opened */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success);/*!< out: TRUE if succeed, FALSE if error */
+/***********************************************************************//**
+Deletes a file. The file has to be closed before calling this.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_delete(
+/*===========*/
+ const char* name); /*!< in: file path as a null-terminated string */
+
+/***********************************************************************//**
+Deletes a file if it exists. The file has to be closed before calling this.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_delete_if_exists(
+/*=====================*/
+ const char* name); /*!< in: file path as a null-terminated string */
+/***********************************************************************//**
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_rename(
+/*===========*/
+ const char* oldpath, /*!< in: old file path as a
+ null-terminated string */
+ const char* newpath); /*!< in: new file path */
+/***********************************************************************//**
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close(
+/*==========*/
+ os_file_t file); /*!< in, own: handle to a file */
+#ifdef UNIV_HOTBACKUP
+/***********************************************************************//**
+Closes a file handle.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_no_error_handling(
+/*============================*/
+ os_file_t file); /*!< in, own: handle to a file */
+#endif /* UNIV_HOTBACKUP */
+/***********************************************************************//**
+Gets a file size.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_get_size(
+/*=============*/
+ os_file_t file, /*!< in: handle to a file */
+ ulint* size, /*!< out: least significant 32 bits of file
+ size */
+ ulint* size_high);/*!< out: most significant 32 bits of size */
+/***********************************************************************//**
+Gets file size as a 64-bit integer ib_int64_t.
+@return size in bytes, -1 if error */
+UNIV_INTERN
+ib_int64_t
+os_file_get_size_as_iblonglong(
+/*===========================*/
+ os_file_t file); /*!< in: handle to a file */
+/***********************************************************************//**
+Write the specified number of zeros to a newly created file.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_size(
+/*=============*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ ulint size, /*!< in: least significant 32 bits of file
+ size */
+ ulint size_high);/*!< in: most significant 32 bits of size */
+/***********************************************************************//**
+Truncates a file at its current position.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof(
+/*============*/
+ FILE* file); /*!< in: file to be truncated */
+/***********************************************************************//**
+Flushes the write buffers of a given file to the disk.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_flush(
+/*==========*/
+ os_file_t file); /*!< in, own: handle to a file */
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return error number, or OS error number + 100 */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+ ibool report_all_errors); /*!< in: TRUE if we want an error message
+ printed of all errors */
+/*******************************************************************//**
+Requests a synchronous read operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read(
+/*=========*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n); /*!< in: number of bytes to read */
+/*******************************************************************//**
+Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files. */
+UNIV_INTERN
+void
+os_file_read_string(
+/*================*/
+ FILE* file, /*!< in: file to read from */
+ char* str, /*!< in: buffer where to read */
+ ulint size); /*!< in: size of buffer */
+/*******************************************************************//**
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns FALSE.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_no_error_handling(
+/*===========================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n); /*!< in: number of bytes to read */
+
+/*******************************************************************//**
+Requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write(
+/*==========*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from which to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to write */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n); /*!< in: number of bytes to write */
+/*******************************************************************//**
+Check the existence and type of the given file.
+@return TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+ const char* path, /*!< in: pathname of the file */
+ ibool* exists, /*!< out: TRUE if file exists */
+ os_file_type_t* type); /*!< out: type of the file (if it exists) */
+/****************************************************************//**
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string. In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'. Trailing '/' charac­
+ters are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from heap. It is the caller responsibility
+to free it after it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+ path dirname basename
+ "/usr/lib" "/usr" "lib"
+ "/usr/" "/" "usr"
+ "usr" "." "usr"
+ "/" "/" "/"
+ "." "." "."
+ ".." "." ".."
+
+@return own: directory component of the pathname */
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+ const char* path); /*!< in: pathname */
+/****************************************************************//**
+Creates all missing subdirectories along the given path.
+@return TRUE if call succeeded FALSE otherwise */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+ const char* path); /*!< in: path name */
+/***********************************************************************
+Initializes the asynchronous io system. Creates one array each for ibuf
+and log i/o. Also creates one array each for read and write where each
+array is divided logically into n_read_segs and n_write_segs
+respectively. The caller must create an i/o handler thread for each
+segment in these arrays. This function also creates the sync array.
+No i/o handler thread needs to be created for that */
+UNIV_INTERN
+void
+os_aio_init(
+/*========*/
+ ulint n_per_seg, /*<! in: maximum number of pending aio
+ operations allowed per segment */
+ ulint n_read_segs, /*<! in: number of reader threads */
+ ulint n_write_segs, /*<! in: number of writer threads */
+ ulint n_slots_sync); /*<! in: number of slots in the sync aio
+ array */
+/*******************************************************************//**
+Requests an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INTERN
+ibool
+os_aio(
+/*===*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
+ to OS_AIO_SIMULATED_WAKE_LATER: the
+ last flag advises this function not to wake
+ i/o-handler threads, but the caller will
+ do the waking explicitly later, in this
+ way the caller can post several requests in
+ a batch; NOTE that the batch must not be
+ so big that it exhausts the slots in aio
+ arrays! NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read or write */
+ ulint offset_high, /*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2);/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+/************************************************************************//**
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void);
+/*=====================================*/
+/************************************************************************//**
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void);
+/*=====================================*/
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if they have something to do. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void);
+/*=======================================*/
+/**********************************************************************//**
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep(void);
+/*============================================*/
+
+#ifdef WIN_ASYNC_IO
+/**********************************************************************//**
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait the
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_windows_handle(
+/*==================*/
+ ulint segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads; if
+ this is ULINT_UNDEFINED, then it means that
+ sync aio is used, and this parameter is
+ ignored */
+ ulint pos, /*!< this parameter is used only in sync aio:
+ wait for the aio slot at this position */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */
+#endif
+
+/**********************************************************************//**
+Does simulated aio. This function should be called by an i/o-handler
+thread.
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_simulated_handle(
+/*====================*/
+ ulint segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */
+/**********************************************************************//**
+Validates the consistency of the aio system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+os_aio_validate(void);
+/*=================*/
+/**********************************************************************//**
+Prints info of the aio arrays. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+ FILE* file); /*!< in: file where to print */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+os_aio_refresh_stats(void);
+/*======================*/
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void);
+/*=======================*/
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+This function returns information about the specified file
+@return TRUE if stat information found */
+UNIV_INTERN
+ibool
+os_file_get_status(
+/*===============*/
+ const char* path, /*!< in: pathname of the file */
+ os_file_stat_t* stat_info); /*!< information of a file in a
+ directory */
+
+#if !defined(UNIV_HOTBACKUP) && !defined(__NETWARE__)
+/*********************************************************************//**
+Creates a temporary file that will be deleted on close.
+This function is defined in ha_innodb.cc.
+@return temporary file descriptor, or < 0 on error */
+UNIV_INTERN
+int
+innobase_mysql_tmpfile(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
+
+#endif
diff --git a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h
new file mode 100644
index 00000000000..fd46bd7db87
--- /dev/null
+++ b/storage/innobase/include/os0proc.h
@@ -0,0 +1,77 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0proc.h
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0proc_h
+#define os0proc_h
+
+#include "univ.i"
+
+#ifdef UNIV_LINUX
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#endif
+
+typedef void* os_process_t;
+typedef unsigned long int os_process_id_t;
+
+extern ibool os_use_large_pages;
+/* Large page size. This may be a boot-time option on some platforms */
+extern ulint os_large_page_size;
+
+/****************************************************************//**
+Converts the current process id to a number. It is not guaranteed that the
+number is unique. In Linux returns the 'process number' of the current
+thread. That number is the same as one sees in 'top', for example. In Linux
+the thread id is not the same as one sees in 'top'.
+@return process id as a number */
+UNIV_INTERN
+ulint
+os_proc_get_number(void);
+/*====================*/
+/****************************************************************//**
+Allocates large pages memory.
+@return allocated memory */
+UNIV_INTERN
+void*
+os_mem_alloc_large(
+/*===============*/
+ ulint* n); /*!< in/out: number of bytes */
+/****************************************************************//**
+Frees large pages memory. */
+UNIV_INTERN
+void
+os_mem_free_large(
+/*==============*/
+ void *ptr, /*!< in: pointer returned by
+ os_mem_alloc_large() */
+ ulint size); /*!< in: size returned by
+ os_mem_alloc_large() */
+
+#ifndef UNIV_NONINL
+#include "os0proc.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/os0proc.ic b/storage/innobase/include/os0proc.ic
new file mode 100644
index 00000000000..c9641644525
--- /dev/null
+++ b/storage/innobase/include/os0proc.ic
@@ -0,0 +1,27 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0proc.ic
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/storage/innobase/include/os0sync.h b/storage/innobase/include/os0sync.h
new file mode 100644
index 00000000000..0e0b32e7036
--- /dev/null
+++ b/storage/innobase/include/os0sync.h
@@ -0,0 +1,386 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0sync.h
+The interface to the operating system
+synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0sync_h
+#define os0sync_h
+
+#include "univ.i"
+#include "ut0lst.h"
+
+#ifdef __WIN__
+
+/** Native mutex */
+#define os_fast_mutex_t CRITICAL_SECTION
+
+/** Native event */
+typedef HANDLE os_native_event_t;
+
+/** Operating system event */
+typedef struct os_event_struct os_event_struct_t;
+/** Operating system event handle */
+typedef os_event_struct_t* os_event_t;
+
+/** An asynchronous signal sent between threads */
+struct os_event_struct {
+ os_native_event_t handle;
+ /*!< Windows event */
+ UT_LIST_NODE_T(os_event_struct_t) os_event_list;
+ /*!< list of all created events */
+};
+#else
+/** Native mutex */
+typedef pthread_mutex_t os_fast_mutex_t;
+
+/** Operating system event */
+typedef struct os_event_struct os_event_struct_t;
+/** Operating system event handle */
+typedef os_event_struct_t* os_event_t;
+
+/** An asynchronous signal sent between threads */
+struct os_event_struct {
+ os_fast_mutex_t os_mutex; /*!< this mutex protects the next
+ fields */
+ ibool is_set; /*!< this is TRUE when the event is
+ in the signaled state, i.e., a thread
+ does not stop if it tries to wait for
+ this event */
+ ib_int64_t signal_count; /*!< this is incremented each time
+ the event becomes signaled */
+ pthread_cond_t cond_var; /*!< condition variable is used in
+ waiting for the event */
+ UT_LIST_NODE_T(os_event_struct_t) os_event_list;
+ /*!< list of all created events */
+};
+#endif
+
+/** Operating system mutex */
+typedef struct os_mutex_struct os_mutex_str_t;
+/** Operating system mutex handle */
+typedef os_mutex_str_t* os_mutex_t;
+
+/** Denotes an infinite delay for os_event_wait_time() */
+#define OS_SYNC_INFINITE_TIME ((ulint)(-1))
+
+/** Return value of os_event_wait_time() when the time is exceeded */
+#define OS_SYNC_TIME_EXCEEDED 1
+
+/** Mutex protecting counts and the event and OS 'slow' mutex lists */
+extern os_mutex_t os_sync_mutex;
+
+/** This is incremented by 1 in os_thread_create and decremented by 1 in
+os_thread_exit */
+extern ulint os_thread_count;
+
+extern ulint os_event_count;
+extern ulint os_mutex_count;
+extern ulint os_fast_mutex_count;
+
+/*********************************************************//**
+Initializes global event and OS 'slow' mutex lists. */
+UNIV_INTERN
+void
+os_sync_init(void);
+/*==============*/
+/*********************************************************//**
+Frees created events and OS 'slow' mutexes. */
+UNIV_INTERN
+void
+os_sync_free(void);
+/*==============*/
+/*********************************************************//**
+Creates an event semaphore, i.e., a semaphore which may just have two states:
+signaled and nonsignaled. The created event is manual reset: it must be reset
+explicitly by calling sync_os_reset_event.
+@return the event handle */
+UNIV_INTERN
+os_event_t
+os_event_create(
+/*============*/
+ const char* name); /*!< in: the name of the event, if NULL
+ the event is created without a name */
+/**********************************************************//**
+Sets an event semaphore to the signaled state: lets waiting threads
+proceed. */
+UNIV_INTERN
+void
+os_event_set(
+/*=========*/
+ os_event_t event); /*!< in: event to set */
+/**********************************************************//**
+Resets an event semaphore to the nonsignaled state. Waiting threads will
+stop to wait for the event.
+The return value should be passed to os_even_wait_low() if it is desired
+that this thread should not wait in case of an intervening call to
+os_event_set() between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low(). */
+UNIV_INTERN
+ib_int64_t
+os_event_reset(
+/*===========*/
+ os_event_t event); /*!< in: event to reset */
+/**********************************************************//**
+Frees an event object. */
+UNIV_INTERN
+void
+os_event_free(
+/*==========*/
+ os_event_t event); /*!< in: event to free */
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state. If
+srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS this also exits the
+waiting thread when the event becomes signaled (or immediately if the
+event is already in the signaled state).
+
+Typically, if the event has been signalled after the os_event_reset()
+we'll return immediately because event->is_set == TRUE.
+There are, however, situations (e.g.: sync_array code) where we may
+lose this information. For example:
+
+thread A calls os_event_reset()
+thread B calls os_event_set() [event->is_set == TRUE]
+thread C calls os_event_reset() [event->is_set == FALSE]
+thread A calls os_event_wait() [infinite wait!]
+thread C calls os_event_wait() [infinite wait!]
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by os_event_reset() should be passed in as
+reset_sig_count. */
+UNIV_INTERN
+void
+os_event_wait_low(
+/*==============*/
+ os_event_t event, /*!< in: event to wait */
+ ib_int64_t reset_sig_count);/*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+
+#define os_event_wait(event) os_event_wait_low(event, 0)
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded. In Unix the timeout is always infinite.
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+UNIV_INTERN
+ulint
+os_event_wait_time(
+/*===============*/
+ os_event_t event, /*!< in: event to wait */
+ ulint time); /*!< in: timeout in microseconds, or
+ OS_SYNC_INFINITE_TIME */
+#ifdef __WIN__
+/**********************************************************//**
+Waits for any event in an OS native event array. Returns if even a single
+one is signaled or becomes signaled.
+@return index of the event which was signaled */
+UNIV_INTERN
+ulint
+os_event_wait_multiple(
+/*===================*/
+ ulint n, /*!< in: number of events in the
+ array */
+ os_native_event_t* native_event_array);
+ /*!< in: pointer to an array of event
+ handles */
+#endif
+/*********************************************************//**
+Creates an operating system mutex semaphore. Because these are slow, the
+mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
+@return the mutex handle */
+UNIV_INTERN
+os_mutex_t
+os_mutex_create(
+/*============*/
+ const char* name); /*!< in: the name of the mutex, if NULL
+ the mutex is created without a name */
+/**********************************************************//**
+Acquires ownership of a mutex semaphore. */
+UNIV_INTERN
+void
+os_mutex_enter(
+/*===========*/
+ os_mutex_t mutex); /*!< in: mutex to acquire */
+/**********************************************************//**
+Releases ownership of a mutex. */
+UNIV_INTERN
+void
+os_mutex_exit(
+/*==========*/
+ os_mutex_t mutex); /*!< in: mutex to release */
+/**********************************************************//**
+Frees an mutex object. */
+UNIV_INTERN
+void
+os_mutex_free(
+/*==========*/
+ os_mutex_t mutex); /*!< in: mutex to free */
+/**********************************************************//**
+Acquires ownership of a fast mutex. Currently in Windows this is the same
+as os_fast_mutex_lock!
+@return 0 if success, != 0 if was reserved by another thread */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+ os_fast_mutex_t* fast_mutex); /*!< in: mutex to acquire */
+/**********************************************************//**
+Releases ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_unlock(
+/*=================*/
+ os_fast_mutex_t* fast_mutex); /*!< in: mutex to release */
+/*********************************************************//**
+Initializes an operating system fast mutex semaphore. */
+UNIV_INTERN
+void
+os_fast_mutex_init(
+/*===============*/
+ os_fast_mutex_t* fast_mutex); /*!< in: fast mutex */
+/**********************************************************//**
+Acquires ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_lock(
+/*===============*/
+ os_fast_mutex_t* fast_mutex); /*!< in: mutex to acquire */
+/**********************************************************//**
+Frees an mutex object. */
+UNIV_INTERN
+void
+os_fast_mutex_free(
+/*===============*/
+ os_fast_mutex_t* fast_mutex); /*!< in: mutex to free */
+
+/**********************************************************//**
+Atomic compare-and-swap and increment for InnoDB. */
+
+#ifdef HAVE_GCC_ATOMIC_BUILTINS
+/**********************************************************//**
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+# define os_compare_and_swap(ptr, old_val, new_val) \
+ __sync_bool_compare_and_swap(ptr, old_val, new_val)
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+ os_compare_and_swap(ptr, old_val, new_val)
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+ os_compare_and_swap(ptr, old_val, new_val)
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ os_compare_and_swap(ptr, old_val, new_val)
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+# define os_atomic_increment(ptr, amount) \
+ __sync_add_and_fetch(ptr, amount)
+# define os_atomic_increment_lint(ptr, amount) \
+ os_atomic_increment(ptr, amount)
+# define os_atomic_increment_ulint(ptr, amount) \
+ os_atomic_increment(ptr, amount)
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val */
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+ __sync_lock_test_and_set(ptr, new_val)
+/* If not compiling with GCC or GCC doesn't support the atomic
+intrinsics and running on Solaris >= 10 use Solaris atomics */
+#elif defined(HAVE_SOLARIS_ATOMICS)
+#include <atomic.h>
+/**********************************************************//**
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+ (atomic_cas_ulong(ptr, old_val, new_val) == old_val)
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+ ((lint)atomic_cas_ulong((ulong_t*) ptr, old_val, new_val) == old_val)
+# ifdef INNODB_RW_LOCKS_USE_ATOMICS
+# if SIZEOF_PTHREAD_T == 4
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ ((pthread_t)atomic_cas_32(ptr, old_val, new_val) == old_val)
+# elif SIZEOF_PTHREAD_T == 8
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ ((pthread_t)atomic_cas_64(ptr, old_val, new_val) == old_val)
+# else
+# error "SIZEOF_PTHREAD_T != 4 or 8"
+# endif /* SIZEOF_PTHREAD_T CHECK */
+# endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+# define os_atomic_increment_lint(ptr, amount) \
+ atomic_add_long_nv((ulong_t*) ptr, amount)
+# define os_atomic_increment_ulint(ptr, amount) \
+ atomic_add_long_nv(ptr, amount)
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val */
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+ atomic_swap_uchar(ptr, new_val)
+/* On Windows, use Windows atomics / interlocked */
+#elif defined(HAVE_WINDOWS_ATOMICS)
+# ifdef _WIN64
+# define win_cmp_and_xchg InterlockedCompareExchange64
+# define win_xchg_and_add InterlockedExchangeAdd64
+# else /* _WIN64 */
+# define win_cmp_and_xchg InterlockedCompareExchange
+# define win_xchg_and_add InterlockedExchangeAdd
+# endif
+/**********************************************************//**
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+ (win_cmp_and_xchg(ptr, new_val, old_val) == old_val)
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+ (win_cmp_and_xchg(ptr, new_val, old_val) == old_val)
+# ifdef INNODB_RW_LOCKS_USE_ATOMICS
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ (InterlockedCompareExchange(ptr, new_val, old_val) == old_val)
+# endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+# define os_atomic_increment_lint(ptr, amount) \
+ (win_xchg_and_add(ptr, amount) + amount)
+# define os_atomic_increment_ulint(ptr, amount) \
+ ((ulint) (win_xchg_and_add(ptr, amount) + amount))
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val.
+InterlockedExchange() operates on LONG, and the LONG will be
+clobbered */
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+ ((byte) InterlockedExchange(ptr, new_val))
+#endif /* HAVE_GCC_ATOMIC_BUILTINS */
+
+#ifndef UNIV_NONINL
+#include "os0sync.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/os0sync.ic b/storage/innobase/include/os0sync.ic
new file mode 100644
index 00000000000..1f3ce38fa65
--- /dev/null
+++ b/storage/innobase/include/os0sync.ic
@@ -0,0 +1,53 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0sync.ic
+The interface to the operating system synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#ifdef __WIN__
+#include <winbase.h>
+#endif
+
+/**********************************************************//**
+Acquires ownership of a fast mutex. Currently in Windows this is the same
+as os_fast_mutex_lock!
+@return 0 if success, != 0 if was reserved by another thread */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+ os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */
+{
+#ifdef __WIN__
+ EnterCriticalSection(fast_mutex);
+
+ return(0);
+#else
+ /* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
+ so that it returns 0 on success. In the operating system
+ libraries, HP-UX-10.20 follows the old Posix 1003.4a Draft 4 and
+ returns 1 on success (but MySQL remaps that to 0), while Linux,
+ FreeBSD, Solaris, AIX, Tru64 Unix, HP-UX-11.0 return 0 on success. */
+
+ return((ulint) pthread_mutex_trylock(fast_mutex));
+#endif
+}
diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h
new file mode 100644
index 00000000000..6583de0005f
--- /dev/null
+++ b/storage/innobase/include/os0thread.h
@@ -0,0 +1,162 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0thread.h
+The interface to the operating system
+process and thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0thread_h
+#define os0thread_h
+
+#include "univ.i"
+
+/* Maximum number of threads which can be created in the program;
+this is also the size of the wait slot array for MySQL threads which
+can wait inside InnoDB */
+
+#define OS_THREAD_MAX_N srv_max_n_threads
+
+
+/* Possible fixed priorities for threads */
+#define OS_THREAD_PRIORITY_NONE 100
+#define OS_THREAD_PRIORITY_BACKGROUND 1
+#define OS_THREAD_PRIORITY_NORMAL 2
+#define OS_THREAD_PRIORITY_ABOVE_NORMAL 3
+
+#ifdef __WIN__
+typedef void* os_thread_t;
+typedef unsigned long os_thread_id_t; /*!< In Windows the thread id
+ is an unsigned long int */
+#else
+typedef pthread_t os_thread_t;
+typedef os_thread_t os_thread_id_t; /*!< In Unix we use the thread
+ handle itself as the id of
+ the thread */
+#endif
+
+/* Define a function pointer type to use in a typecast */
+typedef void* (*os_posix_f_t) (void*);
+
+/***************************************************************//**
+Compares two thread ids for equality.
+@return TRUE if equal */
+UNIV_INTERN
+ibool
+os_thread_eq(
+/*=========*/
+ os_thread_id_t a, /*!< in: OS thread or thread id */
+ os_thread_id_t b); /*!< in: OS thread or thread id */
+/****************************************************************//**
+Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is
+unique for the thread though!
+@return thread identifier as a number */
+UNIV_INTERN
+ulint
+os_thread_pf(
+/*=========*/
+ os_thread_id_t a); /*!< in: OS thread identifier */
+/****************************************************************//**
+Creates a new thread of execution. The execution starts from
+the function given. The start function takes a void* parameter
+and returns a ulint.
+NOTE: We count the number of threads in os_thread_exit(). A created
+thread should always use that to exit and not use return() to exit.
+@return handle to the thread */
+UNIV_INTERN
+os_thread_t
+os_thread_create(
+/*=============*/
+#ifndef __WIN__
+ os_posix_f_t start_f,
+#else
+ ulint (*start_f)(void*), /*!< in: pointer to function
+ from which to start */
+#endif
+ void* arg, /*!< in: argument to start
+ function */
+ os_thread_id_t* thread_id); /*!< out: id of the created
+ thread, or NULL */
+
+/*****************************************************************//**
+Exits the current thread. */
+UNIV_INTERN
+void
+os_thread_exit(
+/*===========*/
+ void* exit_value); /*!< in: exit value; in Windows this void*
+ is cast as a DWORD */
+/*****************************************************************//**
+Returns the thread identifier of current thread.
+@return current thread identifier */
+UNIV_INTERN
+os_thread_id_t
+os_thread_get_curr_id(void);
+/*========================*/
+/*****************************************************************//**
+Returns handle to the current thread.
+@return current thread handle */
+UNIV_INTERN
+os_thread_t
+os_thread_get_curr(void);
+/*====================*/
+/*****************************************************************//**
+Advises the os to give up remainder of the thread's time slice. */
+UNIV_INTERN
+void
+os_thread_yield(void);
+/*=================*/
+/*****************************************************************//**
+The thread sleeps at least the time given in microseconds. */
+UNIV_INTERN
+void
+os_thread_sleep(
+/*============*/
+ ulint tm); /*!< in: time in microseconds */
+/******************************************************************//**
+Gets a thread priority.
+@return priority */
+UNIV_INTERN
+ulint
+os_thread_get_priority(
+/*===================*/
+ os_thread_t handle);/*!< in: OS handle to the thread */
+/******************************************************************//**
+Sets a thread priority. */
+UNIV_INTERN
+void
+os_thread_set_priority(
+/*===================*/
+ os_thread_t handle, /*!< in: OS handle to the thread */
+ ulint pri); /*!< in: priority: one of OS_PRIORITY_... */
+/******************************************************************//**
+Gets the last operating system error code for the calling thread.
+@return last error on Windows, 0 otherwise */
+UNIV_INTERN
+ulint
+os_thread_get_last_error(void);
+/*==========================*/
+
+#ifndef UNIV_NONINL
+#include "os0thread.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/os0thread.ic b/storage/innobase/include/os0thread.ic
new file mode 100644
index 00000000000..f89bc40b4fa
--- /dev/null
+++ b/storage/innobase/include/os0thread.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0thread.ic
+The interface to the operating system
+process and thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
new file mode 100644
index 00000000000..1544b0abe1c
--- /dev/null
+++ b/storage/innobase/include/page0cur.h
@@ -0,0 +1,346 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.h
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef page0cur_h
+#define page0cur_h
+
+#include "univ.i"
+
+#include "buf0types.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+
+
+#define PAGE_CUR_ADAPT
+
+/* Page cursor search modes; the values must be in this order! */
+
+#define PAGE_CUR_UNSUPP 0
+#define PAGE_CUR_G 1
+#define PAGE_CUR_GE 2
+#define PAGE_CUR_L 3
+#define PAGE_CUR_LE 4
+/*#define PAGE_CUR_LE_OR_EXTENDS 5*/ /* This is a search mode used in
+ "column LIKE 'abc%' ORDER BY column DESC";
+ we have to find strings which are <= 'abc' or
+ which extend it */
+#ifdef UNIV_SEARCH_DEBUG
+# define PAGE_CUR_DBG 6 /* As PAGE_CUR_LE, but skips search shortcut */
+#endif /* UNIV_SEARCH_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return page */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+ page_cur_t* cur); /*!< in: page cursor */
+#else /* UNIV_DEBUG */
+# define page_cur_get_page(cur) page_align((cur)->rec)
+# define page_cur_get_block(cur) (cur)->block
+# define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block)
+# define page_cur_get_rec(cur) (cur)->rec
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ const page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ const page_cur_t* cur); /*!< in: cursor */
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ const rec_t* rec, /*!< in: record on a page */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ page_cur_t* cur); /*!< out: page cursor */
+/**********************************************************//**
+Invalidates a page cursor by setting the record pointer NULL. */
+UNIV_INLINE
+void
+page_cur_invalidate(
+/*================*/
+ page_cur_t* cur); /*!< out: page cursor */
+/**********************************************************//**
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur); /*!< in/out: cursor; must not be after last */
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+ page_cur_t* cur); /*!< in/out: cursor; not before first */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const rec_t* rec, /*!< in: record to insert */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */
+/***********************************************************//**
+Inserts a record next to page cursor on an uncompressed page.
+Returns pointer to inserted record if succeed, i.e., enough
+space available, NULL otherwise. The cursor stays at the same position.
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+ rec_t* current_rec,/*!< in: pointer to current record after
+ which the new record is inserted */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */
+/***********************************************************//**
+Inserts a record next to page cursor on a compressed and uncompressed
+page. Returns pointer to inserted record if succeed, i.e.,
+enough space available, NULL otherwise.
+The cursor stays at the same position.
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+ rec_t** current_rec,/*!< in/out: pointer to current record after
+ which the new record is inserted */
+ buf_block_t* block, /*!< in: buffer block of *current_rec */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */
+/*************************************************************//**
+Copies records from page to a newly created page, from a given record onward,
+including that record. Infimum and supremum records are not copied. */
+UNIV_INTERN
+void
+page_copy_rec_list_end_to_created_page(
+/*===================================*/
+ page_t* new_page, /*!< in/out: index page to copy to */
+ rec_t* rec, /*!< in: first record to copy */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+UNIV_INTERN
+void
+page_cur_delete_rec(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const ulint* offsets,/*!< in: rec_get_offsets(cursor->rec, index) */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Searches the right position for a page cursor.
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ page_cur_t* cursor);/*!< out: page cursor */
+/****************************************************************//**
+Searches the right position for a page cursor. */
+UNIV_INTERN
+void
+page_cur_search_with_match(
+/*=======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* iup_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ ulint* ilow_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ page_cur_t* cursor);/*!< out: page cursor */
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+UNIV_INTERN
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+ buf_block_t* block, /*!< in: page */
+ page_cur_t* cursor);/*!< out: page cursor */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a log record of a record insert on a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_insert_rec(
+/*======================*/
+ ibool is_short,/*!< in: TRUE if short inserts */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/**********************************************************//**
+Parses a log record of copying a record list end to a new created page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_copy_rec_list_to_created_page(
+/*=====================================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses log record of a record delete on a page.
+@return pointer to record end or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_delete_rec(
+/*======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+
+/** Index page cursor */
+
+struct page_cur_struct{
+ byte* rec; /*!< pointer to a record on page */
+ buf_block_t* block; /*!< pointer to the block containing rec */
+};
+
+#ifndef UNIV_NONINL
+#include "page0cur.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/page0cur.ic b/storage/innobase/include/page0cur.ic
new file mode 100644
index 00000000000..3520677dfb3
--- /dev/null
+++ b/storage/innobase/include/page0cur.ic
@@ -0,0 +1,299 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.ic
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "page0page.h"
+#include "buf0types.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+
+ return(page_align(cur->rec));
+}
+
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return page */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(cur->block);
+}
+
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ return(buf_block_get_page_zip(page_cur_get_block(cur)));
+}
+
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+
+ return(cur->rec);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur) /*!< in: cursor */
+{
+ cur->block = (buf_block_t*) block;
+ cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur) /*!< in: cursor */
+{
+ cur->block = (buf_block_t*) block;
+ cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ const page_cur_t* cur) /*!< in: cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(page_rec_is_infimum(cur->rec));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ const page_cur_t* cur) /*!< in: cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(page_rec_is_supremum(cur->rec));
+}
+
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ const rec_t* rec, /*!< in: record on a page */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ page_cur_t* cur) /*!< out: page cursor */
+{
+ ut_ad(rec && block && cur);
+ ut_ad(page_align(rec) == block->frame);
+
+ cur->rec = (rec_t*) rec;
+ cur->block = (buf_block_t*) block;
+}
+
+/**********************************************************//**
+Invalidates a page cursor by setting the record pointer NULL. */
+UNIV_INLINE
+void
+page_cur_invalidate(
+/*================*/
+ page_cur_t* cur) /*!< out: page cursor */
+{
+ ut_ad(cur);
+
+ cur->rec = NULL;
+ cur->block = NULL;
+}
+
+/**********************************************************//**
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur) /*!< in/out: cursor; must not be after last */
+{
+ ut_ad(!page_cur_is_after_last(cur));
+
+ cur->rec = page_rec_get_next(cur->rec);
+}
+
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+ page_cur_t* cur) /*!< in/out: page cursor, not before first */
+{
+ ut_ad(!page_cur_is_before_first(cur));
+
+ cur->rec = page_rec_get_prev(cur->rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Searches the right position for a page cursor.
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ ulint low_matched_fields = 0;
+ ulint low_matched_bytes = 0;
+ ulint up_matched_fields = 0;
+ ulint up_matched_bytes = 0;
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ page_cur_search_with_match(block, index, tuple, mode,
+ &up_matched_fields,
+ &up_matched_bytes,
+ &low_matched_fields,
+ &low_matched_bytes,
+ cursor);
+ return(low_matched_fields);
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ mem_heap_t* heap;
+ ulint* offsets;
+ ulint size
+ = rec_get_converted_size(index, tuple, n_ext);
+ rec_t* rec;
+
+ heap = mem_heap_create(size
+ + (4 + REC_OFFS_HEADER_SIZE
+ + dtuple_get_n_fields(tuple))
+ * sizeof *offsets);
+ rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(heap, size),
+ index, tuple, n_ext);
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+ if (buf_block_get_page_zip(cursor->block)) {
+ rec = page_cur_insert_rec_zip(&cursor->rec, cursor->block,
+ index, rec, offsets, mtr);
+ } else {
+ rec = page_cur_insert_rec_low(cursor->rec,
+ index, rec, offsets, mtr);
+ }
+
+ mem_heap_free(heap);
+ return(rec);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const rec_t* rec, /*!< in: record to insert */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ if (buf_block_get_page_zip(cursor->block)) {
+ return(page_cur_insert_rec_zip(&cursor->rec, cursor->block,
+ index, rec, offsets, mtr));
+ } else {
+ return(page_cur_insert_rec_low(cursor->rec,
+ index, rec, offsets, mtr));
+ }
+}
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
new file mode 100644
index 00000000000..a4fe069d022
--- /dev/null
+++ b/storage/innobase/include/page0page.h
@@ -0,0 +1,1012 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0page_h
+#define page0page_h
+
+#include "univ.i"
+
+#include "page0types.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "data0data.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "fsp0fsp.h"
+#include "mtr0mtr.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/* PAGE HEADER
+ ===========
+
+Index page header starts at the first offset left free by the FIL-module */
+
+typedef byte page_header_t;
+
+#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this
+ offset */
+/*-----------------------------*/
+#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */
+#define PAGE_HEAP_TOP 2 /* pointer to record heap top */
+#define PAGE_N_HEAP 4 /* number of records in the heap,
+ bit 15=flag: new-style compact page format */
+#define PAGE_FREE 6 /* pointer to start of page free record list */
+#define PAGE_GARBAGE 8 /* number of bytes in deleted records */
+#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or
+ NULL if this info has been reset by a delete,
+ for example */
+#define PAGE_DIRECTION 12 /* last insert direction: PAGE_LEFT, ... */
+#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same
+ direction */
+#define PAGE_N_RECS 16 /* number of user records on the page */
+#define PAGE_MAX_TRX_ID 18 /* highest id of a trx which may have modified
+ a record on the page; a dulint; defined only
+ in secondary indexes and in the insert buffer
+ tree; NOTE: this may be modified only
+ when the thread has an x-latch to the page,
+ and ALSO an x-latch to btr_search_latch
+ if there is a hash index to the page! */
+#define PAGE_HEADER_PRIV_END 26 /* end of private data structure of the page
+ header which are set in a page create */
+/*----*/
+#define PAGE_LEVEL 26 /* level of the node in an index tree; the
+ leaf level is the level 0 */
+#define PAGE_INDEX_ID 28 /* index id where the page belongs */
+#define PAGE_BTR_SEG_LEAF 36 /* file segment header for the leaf pages in
+ a B-tree: defined only on the root page of a
+ B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+ /* in the place of PAGE_BTR_SEG_LEAF and _TOP
+ there is a free list base node if the page is
+ the root page of an ibuf tree, and at the same
+ place is the free list node if the page is in
+ a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+ /* file segment header for the non-leaf pages
+ in a B-tree: defined only on the root page of
+ a B-tree, but not in the root of an ibuf
+ tree */
+/*----*/
+#define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+ /* start of data on the page */
+
+#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES)
+ /* offset of the page infimum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9)
+ /* offset of the page supremum record end on
+ an old-style page */
+#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES)
+ /* offset of the page infimum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8)
+ /* offset of the page supremum record end on
+ a new-style compact page */
+/*-----------------------------*/
+
+/* Heap numbers */
+#define PAGE_HEAP_NO_INFIMUM 0 /* page infimum */
+#define PAGE_HEAP_NO_SUPREMUM 1 /* page supremum */
+#define PAGE_HEAP_NO_USER_LOW 2 /* first user record in
+ creation (insertion) order,
+ not necessarily collation order;
+ this record may have been deleted */
+
+/* Directions of cursor movement */
+#define PAGE_LEFT 1
+#define PAGE_RIGHT 2
+#define PAGE_SAME_REC 3
+#define PAGE_SAME_PAGE 4
+#define PAGE_NO_DIRECTION 5
+
+/* PAGE DIRECTORY
+ ==============
+*/
+
+typedef byte page_dir_slot_t;
+typedef page_dir_slot_t page_dir_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address directory start, as it points to
+the first record in the list of records. */
+#define PAGE_DIR FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+#define PAGE_DIR_SLOT_SIZE 2
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START (PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot. The
+number may drop below the minimum in the first and the last slot in the
+directory. */
+#define PAGE_DIR_SLOT_MAX_N_OWNED 8
+#define PAGE_DIR_SLOT_MIN_N_OWNED 4
+
+/************************************************************//**
+Gets the start of a page.
+@return start of the page */
+UNIV_INLINE
+page_t*
+page_align(
+/*=======*/
+ const void* ptr) /*!< in: pointer to page frame */
+ __attribute__((const));
+/************************************************************//**
+Gets the offset within a page.
+@return offset from the start of the page */
+UNIV_INLINE
+ulint
+page_offset(
+/*========*/
+ const void* ptr) /*!< in: pointer to page frame */
+ __attribute__((const));
+/*************************************************************//**
+Returns the max trx id field value. */
+UNIV_INLINE
+trx_id_t
+page_get_max_trx_id(
+/*================*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Sets the max trx id field value. */
+UNIV_INTERN
+void
+page_set_max_trx_id(
+/*================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in/out: mini-transaction, or NULL */
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/*************************************************************//**
+Reads the given header field. */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+ const page_t* page, /*!< in: page */
+ ulint field); /*!< in: PAGE_N_DIR_SLOTS, ... */
+/*************************************************************//**
+Sets the given header field. */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in: PAGE_N_DIR_SLOTS, ... */
+ ulint val); /*!< in: value */
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+ulint
+page_header_get_offs(
+/*=================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_FREE, ... */
+ __attribute__((nonnull, pure));
+
+/*************************************************************//**
+Returns the pointer stored in the given header field, or NULL. */
+#define page_header_get_ptr(page, field) \
+ (page_header_get_offs(page, field) \
+ ? page + page_header_get_offs(page, field) : NULL)
+/*************************************************************//**
+Sets the pointer stored in the given header field. */
+UNIV_INLINE
+void
+page_header_set_ptr(
+/*================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in/out: PAGE_FREE, ... */
+ const byte* ptr); /*!< in: pointer or NULL*/
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Resets the last insert info field in the page header. Writes to mlog
+about this operation. */
+UNIV_INLINE
+void
+page_header_reset_last_insert(
+/*==========================*/
+ page_t* page, /*!< in: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/************************************************************//**
+Gets the offset of the first record on the page.
+@return offset of the first record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_infimum_offset(
+/*====================*/
+ const page_t* page); /*!< in: page which must have record(s) */
+/************************************************************//**
+Gets the offset of the last record on the page.
+@return offset of the last record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_supremum_offset(
+/*=====================*/
+ const page_t* page); /*!< in: page which must have record(s) */
+#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
+#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
+/************************************************************//**
+Returns the middle record of record list. If there are an even number
+of records in the list, returns the first record of upper half-list.
+@return middle record */
+UNIV_INTERN
+rec_t*
+page_get_middle_rec(
+/*================*/
+ page_t* page); /*!< in: page */
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Compares a data tuple to a physical record. Differs from the function
+cmp_dtuple_rec_with_match in the way that the record must reside on an
+index page, and also page infimum and supremum records can be given in
+the parameter rec. These are considered as the negative infinity and
+the positive infinity in the alphabetical order.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record on a page; may also
+ be page infimum or supremum, in which case
+ matched-parameter values below are not
+ affected */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when function returns
+ contains the value for current comparison */
+ ulint* matched_bytes); /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns contains the
+ value for current comparison */
+#endif /* !UNIV_HOTBACKUP */
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+ulint
+page_get_page_no(
+/*=============*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+ulint
+page_get_space_id(
+/*==============*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Gets the number of user records on page (the infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+Returns the number of records before the given record in chain.
+The number includes infimum and supremum records.
+@return number of records */
+UNIV_INTERN
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+ const rec_t* rec); /*!< in: the physical record */
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of user records */
+UNIV_INLINE
+ulint
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page); /*!< in: index page */
+/*************************************************************//**
+Sets the number of records in the heap. */
+UNIV_INLINE
+void
+page_dir_set_n_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL.
+ Note that the size of the dense page directory
+ in the compressed page trailer is
+ n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
+ ulint n_heap);/*!< in: number of records */
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page); /*!< in: index page */
+/*************************************************************//**
+Sets the number of dir slots in directory. */
+UNIV_INLINE
+void
+page_dir_set_n_slots(
+/*=================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint n_slots);/*!< in: number of slots */
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Gets pointer to nth directory slot.
+@return pointer to dir slot */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+ const page_t* page, /*!< in: index page */
+ ulint n); /*!< in: position */
+#else /* UNIV_DEBUG */
+# define page_dir_get_nth_slot(page, n) \
+ ((page) + UNIV_PAGE_SIZE - PAGE_DIR \
+ - (n + 1) * PAGE_DIR_SLOT_SIZE)
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if succeed */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec); /*!< in: record */
+/***************************************************************//**
+Gets the record pointed to by a directory slot.
+@return pointer to record */
+UNIV_INLINE
+const rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+ const page_dir_slot_t* slot); /*!< in: directory slot */
+/***************************************************************//**
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+ page_dir_slot_t* slot, /*!< in: directory slot */
+ rec_t* rec); /*!< in: record on the page */
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot); /*!< in: page directory slot */
+/***************************************************************//**
+This is used to set the owned records field of a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+ page_dir_slot_t*slot, /*!< in/out: directory slot */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n); /*!< in: number of records owned by the slot */
+/************************************************************//**
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is a fraction number
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is
+rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs); /*!< in: number of records */
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number */
+UNIV_INTERN
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ const rec_t* rec); /*!< in: the physical record */
+/************************************************************//**
+Determine whether the page is in new-style compact format.
+@return nonzero if the page is in compact format, zero if it is in
+old-style format */
+UNIV_INLINE
+ulint
+page_is_comp(
+/*=========*/
+ const page_t* page); /*!< in: index page */
+/************************************************************//**
+TRUE if the record is on a page in compact format.
+@return nonzero if in compact format */
+UNIV_INLINE
+ulint
+page_rec_is_comp(
+/*=============*/
+ const rec_t* rec); /*!< in: record */
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec); /*!< in: the physical record */
+/************************************************************//**
+Determine whether the page is a B-tree leaf.
+@return TRUE if the page is a B-tree leaf */
+UNIV_INLINE
+ibool
+page_is_leaf(
+/*=========*/
+ const page_t* page) /*!< in: page */
+ __attribute__((nonnull, pure));
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+ const rec_t* rec, /*!< in: pointer to record */
+ ulint comp); /*!< in: nonzero=compact page layout */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+ const rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Sets the pointer to the next record on the page. */
+UNIV_INLINE
+void
+page_rec_set_next(
+/*==============*/
+ rec_t* rec, /*!< in: pointer to record,
+ must not be page supremum */
+ rec_t* next); /*!< in: pointer to next record,
+ must not be page infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+ const rec_t* rec); /*!< in: pointer to record, must not be page
+ infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+ rec_t* rec); /*!< in: pointer to record,
+ must not be page infimum */
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+ __attribute__((const));
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+ __attribute__((const));
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum_low(
+/*====================*/
+ ulint offset) /*!< in: record offset on page */
+ __attribute__((const));
+
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+ __attribute__((const));
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+ __attribute__((const));
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum(
+/*================*/
+ const rec_t* rec) /*!< in: record */
+ __attribute__((const));
+/***************************************************************//**
+Looks for the record which owns the given record.
+@return the owner record */
+UNIV_INLINE
+rec_t*
+page_rec_find_owner_rec(
+/*====================*/
+ rec_t* rec); /*!< in: the physical record */
+/***********************************************************************//**
+This is a low-level operation which is used in a database index creation
+to update the page number of a created B-tree to a data dictionary
+record. */
+UNIV_INTERN
+void
+page_rec_write_index_page_no(
+/*=========================*/
+ rec_t* rec, /*!< in: record to update */
+ ulint i, /*!< in: index of the field to update */
+ ulint page_no,/*!< in: value to write */
+ mtr_t* mtr); /*!< in: mtr */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs);/*!< in: number of records */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap if page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs);/*!< in: number of records */
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page format */
+ __attribute__((const));
+/**********************************************************//**
+Returns the base extra size of a physical record. This is the
+size of the fixed header, independent of the record size.
+@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */
+UNIV_INLINE
+ulint
+page_rec_get_base_extra_size(
+/*=========================*/
+ const rec_t* rec); /*!< in: physical record */
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list
+excluding the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+ulint
+page_get_data_size(
+/*===============*/
+ const page_t* page); /*!< in: index page */
+/************************************************************//**
+Allocates a block of memory from the head of the free list
+of an index page. */
+UNIV_INLINE
+void
+page_mem_alloc_free(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ rec_t* next_rec,/*!< in: pointer to the new head of the
+ free record list */
+ ulint need); /*!< in: number of bytes allocated */
+/************************************************************//**
+Allocates a block of memory from the heap of an index page.
+@return pointer to start of allocated buffer, or NULL if allocation fails */
+UNIV_INTERN
+byte*
+page_mem_alloc_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ ulint need, /*!< in: total number of bytes needed */
+ ulint* heap_no);/*!< out: this contains the heap number
+ of the allocated record
+ if allocation succeeds */
+/************************************************************//**
+Puts a record to free list. */
+UNIV_INLINE
+void
+page_mem_free(
+/*==========*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ rec_t* rec, /*!< in: pointer to the (origin of) record */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Create an uncompressed B-tree index page.
+@return pointer to the page */
+UNIV_INTERN
+page_t*
+page_create(
+/*========*/
+ buf_block_t* block, /*!< in: a buffer block where the
+ page is created */
+ mtr_t* mtr, /*!< in: mini-transaction handle */
+ ulint comp); /*!< in: nonzero=compact page format */
+/**********************************************************//**
+Create a compressed B-tree index page.
+@return pointer to the page */
+UNIV_INTERN
+page_t*
+page_create_zip(
+/*============*/
+ buf_block_t* block, /*!< in/out: a buffer frame where the
+ page is created */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+
+/*************************************************************//**
+Differs from page_copy_rec_list_end, because this function does not
+touch the lock table and max trx id on page or compress the page. */
+UNIV_INTERN
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Copies records from page to new_page, from the given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+@return pointer to the original successor of the infimum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Copies records from page to new_page, up to the given record, NOT
+including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+@return pointer to the original predecessor of the supremum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_end(
+/*=====================*/
+ rec_t* rec, /*!< in: pointer to record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_recs, /*!< in: number of records to delete,
+ or ULINT_UNDEFINED if not known */
+ ulint size, /*!< in: the sum of the sizes of the
+ records in the end of the chain to
+ delete, or ULINT_UNDEFINED if not known */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_start(
+/*=======================*/
+ rec_t* rec, /*!< in: record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec.
+@return TRUE on success; FALSE on compression failure (new_block will
+be decompressed) */
+UNIV_INTERN
+ibool
+page_move_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in: index page from where to move */
+ rec_t* split_rec, /*!< in: first record to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull(1, 2, 4, 5)));
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec.
+@return TRUE on success; FALSE on compression failure */
+UNIV_INTERN
+ibool
+page_move_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in/out: page containing split_rec */
+ rec_t* split_rec, /*!< in: first record not to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull(1, 2, 4, 5)));
+/****************************************************************//**
+Splits a directory slot which owns too many records. */
+UNIV_INTERN
+void
+page_dir_split_slot(
+/*================*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be written, or NULL */
+ ulint slot_no)/*!< in: the directory slot */
+ __attribute__((nonnull(1)));
+/*************************************************************//**
+Tries to balance the given directory slot with too few records
+with the upper neighbor, so that there are at least the minimum number
+of records owned by the slot; this may result in the merging of
+two slots. */
+UNIV_INTERN
+void
+page_dir_balance_slot(
+/*==================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint slot_no)/*!< in: the directory slot */
+ __attribute__((nonnull(1)));
+/**********************************************************//**
+Parses a log record of a record list end or start deletion.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_delete_rec_list(
+/*=======================*/
+ byte type, /*!< in: MLOG_LIST_END_DELETE,
+ MLOG_LIST_START_DELETE,
+ MLOG_COMP_LIST_END_DELETE or
+ MLOG_COMP_LIST_START_DELETE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in/out: buffer block or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses a redo log record of creating a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_create(
+/*==============*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint comp, /*!< in: nonzero=compact page format */
+ buf_block_t* block, /*!< in: block or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/************************************************************//**
+Prints record contents including the data relevant only in
+the index page context. */
+UNIV_INTERN
+void
+page_rec_print(
+/*===========*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: record descriptor */
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+UNIV_INTERN
+void
+page_dir_print(
+/*===========*/
+ page_t* page, /*!< in: index page */
+ ulint pr_n); /*!< in: print n first and n last entries */
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print_list(
+/*============*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint pr_n); /*!< in: print n first and n last entries */
+/***************************************************************//**
+Prints the info in a page header. */
+UNIV_INTERN
+void
+page_header_print(
+/*==============*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print(
+/*=======*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint dn, /*!< in: print dn first and last entries
+ in directory */
+ ulint rn); /*!< in: print rn first and last records
+ in directory */
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_rec_validate(
+/*==============*/
+ rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. */
+UNIV_INTERN
+void
+page_check_dir(
+/*===========*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_old(
+/*=====================*/
+ page_t* page); /*!< in: old-style index page */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_new(
+/*=====================*/
+ page_t* block); /*!< in: new-style index page */
+/***************************************************************//**
+This function checks the consistency of an index page.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_validate(
+/*==========*/
+ page_t* page, /*!< in: index page */
+ dict_index_t* index); /*!< in: data dictionary index containing
+ the page record type definition */
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return record, NULL if not found */
+
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+ const page_t* page, /*!< in: index page */
+ ulint heap_no);/*!< in: heap number */
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
+
+#ifndef UNIV_NONINL
+#include "page0page.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic
new file mode 100644
index 00000000000..318ec1cc1f2
--- /dev/null
+++ b/storage/innobase/include/page0page.ic
@@ -0,0 +1,1073 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.ic
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifdef UNIV_DEBUG
+# include "log0recv.h"
+#endif /* !UNIV_DEBUG */
+#ifndef UNIV_HOTBACKUP
+# include "rem0cmp.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "mtr0log.h"
+#include "page0zip.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/************************************************************//**
+Gets the start of a page.
+@return start of the page */
+UNIV_INLINE
+page_t*
+page_align(
+/*=======*/
+ const void* ptr) /*!< in: pointer to page frame */
+{
+ return((page_t*) ut_align_down(ptr, UNIV_PAGE_SIZE));
+}
+/************************************************************//**
+Gets the offset within a page.
+@return offset from the start of the page */
+UNIV_INLINE
+ulint
+page_offset(
+/*========*/
+ const void* ptr) /*!< in: pointer to page frame */
+{
+ return(ut_align_offset(ptr, UNIV_PAGE_SIZE));
+}
+/*************************************************************//**
+Returns the max trx id field value. */
+UNIV_INLINE
+trx_id_t
+page_get_max_trx_id(
+/*================*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page);
+
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_MAX_TRX_ID));
+}
+
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(block);
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ /* During crash recovery, this function may be called on
+ something else than a leaf page of a secondary index or the
+ insert buffer index tree (dict_index_is_sec_or_ibuf() returns
+ TRUE for the dummy indexes constructed during redo log
+ application). In that case, PAGE_MAX_TRX_ID is unused,
+ and trx_id is usually zero. */
+ ut_ad(!ut_dulint_is_zero(trx_id) || recv_recovery_is_on());
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+ if (ut_dulint_cmp(page_get_max_trx_id(buf_block_get_frame(block)),
+ trx_id) < 0) {
+
+ page_set_max_trx_id(block, page_zip, trx_id, mtr);
+ }
+}
+
+/*************************************************************//**
+Reads the given header field. */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_LEVEL, ... */
+{
+ ut_ad(page);
+ ut_ad(field <= PAGE_INDEX_ID);
+
+ return(mach_read_from_2(page + PAGE_HEADER + field));
+}
+
+/*************************************************************//**
+Sets the given header field. */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in: PAGE_N_DIR_SLOTS, ... */
+ ulint val) /*!< in: value */
+{
+ ut_ad(page);
+ ut_ad(field <= PAGE_N_RECS);
+ ut_ad(field == PAGE_N_HEAP || val < UNIV_PAGE_SIZE);
+ ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE);
+
+ mach_write_to_2(page + PAGE_HEADER + field, val);
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_write_header(page_zip,
+ page + PAGE_HEADER + field, 2, NULL);
+ }
+}
+
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+ulint
+page_header_get_offs(
+/*=================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_FREE, ... */
+{
+ ulint offs;
+
+ ut_ad(page);
+ ut_ad((field == PAGE_FREE)
+ || (field == PAGE_LAST_INSERT)
+ || (field == PAGE_HEAP_TOP));
+
+ offs = page_header_get_field(page, field);
+
+ ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+ return(offs);
+}
+
+/*************************************************************//**
+Sets the pointer stored in the given header field. */
+UNIV_INLINE
+void
+page_header_set_ptr(
+/*================*/
+ page_t* page, /*!< in: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in: PAGE_FREE, ... */
+ const byte* ptr) /*!< in: pointer or NULL*/
+{
+ ulint offs;
+
+ ut_ad(page);
+ ut_ad((field == PAGE_FREE)
+ || (field == PAGE_LAST_INSERT)
+ || (field == PAGE_HEAP_TOP));
+
+ if (ptr == NULL) {
+ offs = 0;
+ } else {
+ offs = ptr - page;
+ }
+
+ ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+ page_header_set_field(page, page_zip, field, offs);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Resets the last insert info field in the page header. Writes to mlog
+about this operation. */
+UNIV_INLINE
+void
+page_header_reset_last_insert(
+/*==========================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(page && mtr);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_2(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_LAST_INSERT),
+ 2, mtr);
+ } else {
+ mlog_write_ulint(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0,
+ MLOG_2BYTES, mtr);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/************************************************************//**
+Determine whether the page is in new-style compact format.
+@return nonzero if the page is in compact format, zero if it is in
+old-style format */
+UNIV_INLINE
+ulint
+page_is_comp(
+/*=========*/
+ const page_t* page) /*!< in: index page */
+{
+ return(UNIV_EXPECT(page_header_get_field(page, PAGE_N_HEAP) & 0x8000,
+ 0x8000));
+}
+
+/************************************************************//**
+TRUE if the record is on a page in compact format.
+@return nonzero if in compact format */
+UNIV_INLINE
+ulint
+page_rec_is_comp(
+/*=============*/
+ const rec_t* rec) /*!< in: record */
+{
+ return(page_is_comp(page_align(rec)));
+}
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ if (page_rec_is_comp(rec)) {
+ return(rec_get_heap_no_new(rec));
+ } else {
+ return(rec_get_heap_no_old(rec));
+ }
+}
+
+/************************************************************//**
+Determine whether the page is a B-tree leaf.
+@return TRUE if the page is a B-tree leaf */
+UNIV_INLINE
+ibool
+page_is_leaf(
+/*=========*/
+ const page_t* page) /*!< in: page */
+{
+ return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_LEVEL)));
+}
+
+/************************************************************//**
+Gets the offset of the first record on the page.
+@return offset of the first record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_infimum_offset(
+/*====================*/
+ const page_t* page) /*!< in: page which must have record(s) */
+{
+ ut_ad(page);
+ ut_ad(!page_offset(page));
+
+ if (page_is_comp(page)) {
+ return(PAGE_NEW_INFIMUM);
+ } else {
+ return(PAGE_OLD_INFIMUM);
+ }
+}
+
+/************************************************************//**
+Gets the offset of the last record on the page.
+@return offset of the last record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_supremum_offset(
+/*=====================*/
+ const page_t* page) /*!< in: page which must have record(s) */
+{
+ ut_ad(page);
+ ut_ad(!page_offset(page));
+
+ if (page_is_comp(page)) {
+ return(PAGE_NEW_SUPREMUM);
+ } else {
+ return(PAGE_OLD_SUPREMUM);
+ }
+}
+
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+#if PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM
+# error "PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM"
+#endif
+#if PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM
+# error "PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM"
+#endif
+#if PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM
+# error "PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM"
+#endif
+#if PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM
+# error "PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM"
+#endif
+#if PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END
+# error "PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END"
+#endif
+#if PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END
+# error "PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END"
+#endif
+ ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+ return(UNIV_LIKELY(offset != PAGE_NEW_SUPREMUM)
+ && UNIV_LIKELY(offset != PAGE_NEW_INFIMUM)
+ && UNIV_LIKELY(offset != PAGE_OLD_INFIMUM)
+ && UNIV_LIKELY(offset != PAGE_OLD_SUPREMUM));
+}
+
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+ return(UNIV_UNLIKELY(offset == PAGE_NEW_SUPREMUM)
+ || UNIV_UNLIKELY(offset == PAGE_OLD_SUPREMUM));
+}
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum_low(
+/*====================*/
+ ulint offset) /*!< in: record offset on page */
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+ return(UNIV_UNLIKELY(offset == PAGE_NEW_INFIMUM)
+ || UNIV_UNLIKELY(offset == PAGE_OLD_INFIMUM));
+}
+
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+{
+ return(page_rec_is_user_rec_low(page_offset(rec)));
+}
+
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+{
+ return(page_rec_is_supremum_low(page_offset(rec)));
+}
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum(
+/*================*/
+ const rec_t* rec) /*!< in: record */
+{
+ return(page_rec_is_infimum_low(page_offset(rec)));
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Compares a data tuple to a physical record. Differs from the function
+cmp_dtuple_rec_with_match in the way that the record must reside on an
+index page, and also page infimum and supremum records can be given in
+the parameter rec. These are considered as the negative infinity and
+the positive infinity in the alphabetical order.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record on a page; may also
+ be page infimum or supremum, in which case
+ matched-parameter values below are not
+ affected */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when function returns
+ contains the value for current comparison */
+ ulint* matched_bytes) /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns contains the
+ value for current comparison */
+{
+ ulint rec_offset;
+
+ ut_ad(dtuple_check_typed(dtuple));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec));
+
+ rec_offset = page_offset(rec);
+
+ if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_INFIMUM)
+ || UNIV_UNLIKELY(rec_offset == PAGE_OLD_INFIMUM)) {
+ return(1);
+ }
+ if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_SUPREMUM)
+ || UNIV_UNLIKELY(rec_offset == PAGE_OLD_SUPREMUM)) {
+ return(-1);
+ }
+
+ return(cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+ matched_fields,
+ matched_bytes));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+ulint
+page_get_page_no(
+/*=============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page == page_align((page_t*) page));
+ return(mach_read_from_4(page + FIL_PAGE_OFFSET));
+}
+
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+ulint
+page_get_space_id(
+/*==============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page == page_align((page_t*) page));
+ return(mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+}
+
+/*************************************************************//**
+Gets the number of user records on page (infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_RECS));
+}
+
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_DIR_SLOTS));
+}
+/*************************************************************//**
+Sets the number of dir slots in directory. */
+UNIV_INLINE
+void
+page_dir_set_n_slots(
+/*=================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint n_slots)/*!< in: number of slots */
+{
+ page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots);
+}
+
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of user records */
+UNIV_INLINE
+ulint
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff);
+}
+
+/*************************************************************//**
+Sets the number of records in the heap. */
+UNIV_INLINE
+void
+page_dir_set_n_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL.
+ Note that the size of the dense page directory
+ in the compressed page trailer is
+ n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
+ ulint n_heap) /*!< in: number of records */
+{
+ ut_ad(n_heap < 0x8000);
+ ut_ad(!page_zip || n_heap
+ == (page_header_get_field(page, PAGE_N_HEAP) & 0x7fff) + 1);
+
+ page_header_set_field(page, page_zip, PAGE_N_HEAP, n_heap
+ | (0x8000
+ & page_header_get_field(page, PAGE_N_HEAP)));
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Gets pointer to nth directory slot.
+@return pointer to dir slot */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+ const page_t* page, /*!< in: index page */
+ ulint n) /*!< in: position */
+{
+ ut_ad(page_dir_get_n_slots(page) > n);
+
+ return((page_dir_slot_t*)
+ page + UNIV_PAGE_SIZE - PAGE_DIR
+ - (n + 1) * PAGE_DIR_SLOT_SIZE);
+}
+#endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if succeed */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec) /*!< in: record */
+{
+ const page_t* page = page_align(rec);
+
+ ut_a(rec);
+
+ ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP));
+ ut_a(page_offset(rec) >= PAGE_DATA);
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Gets the record pointed to by a directory slot.
+@return pointer to record */
+UNIV_INLINE
+const rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+ const page_dir_slot_t* slot) /*!< in: directory slot */
+{
+ return(page_align(slot) + mach_read_from_2(slot));
+}
+
+/***************************************************************//**
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+ page_dir_slot_t* slot, /*!< in: directory slot */
+ rec_t* rec) /*!< in: record on the page */
+{
+ ut_ad(page_rec_check(rec));
+
+ mach_write_to_2(slot, page_offset(rec));
+}
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot) /*!< in: page directory slot */
+{
+ const rec_t* rec = page_dir_slot_get_rec(slot);
+ if (page_rec_is_comp(slot)) {
+ return(rec_get_n_owned_new(rec));
+ } else {
+ return(rec_get_n_owned_old(rec));
+ }
+}
+
+/***************************************************************//**
+This is used to set the owned records field of a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+ page_dir_slot_t*slot, /*!< in/out: directory slot */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n) /*!< in: number of records owned by the slot */
+{
+ rec_t* rec = (rec_t*) page_dir_slot_get_rec(slot);
+ if (page_rec_is_comp(slot)) {
+ rec_set_n_owned_new(rec, page_zip, n);
+ } else {
+ ut_ad(!page_zip);
+ rec_set_n_owned_old(rec, n);
+ }
+}
+
+/************************************************************//**
+Calculates the space reserved for directory slots of a given number of
+records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs) /*!< in: number of records */
+{
+ return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1)
+ / PAGE_DIR_SLOT_MIN_N_OWNED);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+ const rec_t* rec, /*!< in: pointer to record */
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ ulint offs;
+ const page_t* page;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+
+ offs = rec_get_next_offs(rec, comp);
+
+ if (UNIV_UNLIKELY(offs >= UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Next record offset is nonsensical %lu"
+ " in record at offset %lu\n"
+ "InnoDB: rec address %p, space id %lu, page %lu\n",
+ (ulong)offs, (ulong) page_offset(rec),
+ (void*) rec,
+ (ulong) page_get_space_id(page),
+ (ulong) page_get_page_no(page));
+ buf_page_print(page, 0);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(offs == 0)) {
+
+ return(NULL);
+ }
+
+ return(page + offs);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ rec_t* rec) /*!< in: pointer to record */
+{
+ return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ return(page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Sets the pointer to the next record on the page. */
+UNIV_INLINE
+void
+page_rec_set_next(
+/*==============*/
+ rec_t* rec, /*!< in: pointer to record,
+ must not be page supremum */
+ rec_t* next) /*!< in: pointer to next record,
+ must not be page infimum */
+{
+ ulint offs;
+
+ ut_ad(page_rec_check(rec));
+ ut_ad(!page_rec_is_supremum(rec));
+ ut_ad(rec != next);
+
+ ut_ad(!next || !page_rec_is_infimum(next));
+ ut_ad(!next || page_align(rec) == page_align(next));
+
+ if (UNIV_LIKELY(next != NULL)) {
+ offs = page_offset(next);
+ } else {
+ offs = 0;
+ }
+
+ if (page_rec_is_comp(rec)) {
+ rec_set_next_offs_new(rec, offs);
+ } else {
+ rec_set_next_offs_old(rec, offs);
+ }
+}
+
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+ const rec_t* rec) /*!< in: pointer to record, must not be page
+ infimum */
+{
+ const page_dir_slot_t* slot;
+ ulint slot_no;
+ const rec_t* rec2;
+ const rec_t* prev_rec = NULL;
+ const page_t* page;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+
+ ut_ad(!page_rec_is_infimum(rec));
+
+ slot_no = page_dir_find_owner_slot(rec);
+
+ ut_a(slot_no != 0);
+
+ slot = page_dir_get_nth_slot(page, slot_no - 1);
+
+ rec2 = page_dir_slot_get_rec(slot);
+
+ if (page_is_comp(page)) {
+ while (rec != rec2) {
+ prev_rec = rec2;
+ rec2 = page_rec_get_next_low(rec2, TRUE);
+ }
+ } else {
+ while (rec != rec2) {
+ prev_rec = rec2;
+ rec2 = page_rec_get_next_low(rec2, FALSE);
+ }
+ }
+
+ ut_a(prev_rec);
+
+ return(prev_rec);
+}
+
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+ rec_t* rec) /*!< in: pointer to record, must not be page
+ infimum */
+{
+ return((rec_t*) page_rec_get_prev_const(rec));
+}
+
+/***************************************************************//**
+Looks for the record which owns the given record.
+@return the owner record */
+UNIV_INLINE
+rec_t*
+page_rec_find_owner_rec(
+/*====================*/
+ rec_t* rec) /*!< in: the physical record */
+{
+ ut_ad(page_rec_check(rec));
+
+ if (page_rec_is_comp(rec)) {
+ while (rec_get_n_owned_new(rec) == 0) {
+ rec = page_rec_get_next(rec);
+ }
+ } else {
+ while (rec_get_n_owned_old(rec) == 0) {
+ rec = page_rec_get_next(rec);
+ }
+ }
+
+ return(rec);
+}
+
+/**********************************************************//**
+Returns the base extra size of a physical record. This is the
+size of the fixed header, independent of the record size.
+@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */
+UNIV_INLINE
+ulint
+page_rec_get_base_extra_size(
+/*=========================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+#if REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES
+# error "REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES"
+#endif
+ return(REC_N_NEW_EXTRA_BYTES + (ulint) !page_rec_is_comp(rec));
+}
+
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list, excluding
+the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+ulint
+page_get_data_size(
+/*===============*/
+ const page_t* page) /*!< in: index page */
+{
+ ulint ret;
+
+ ret = (ulint)(page_header_get_field(page, PAGE_HEAP_TOP)
+ - (page_is_comp(page)
+ ? PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END)
+ - page_header_get_field(page, PAGE_GARBAGE));
+
+ ut_ad(ret < UNIV_PAGE_SIZE);
+
+ return(ret);
+}
+
+
+/************************************************************//**
+Allocates a block of memory from the free list of an index page. */
+UNIV_INTERN
+void
+page_mem_alloc_free(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ rec_t* next_rec,/*!< in: pointer to the new head of the
+ free record list */
+ ulint need) /*!< in: number of bytes allocated */
+{
+ ulint garbage;
+
+#ifdef UNIV_DEBUG
+ const rec_t* old_rec = page_header_get_ptr(page, PAGE_FREE);
+ ulint next_offs;
+
+ ut_ad(old_rec);
+ next_offs = rec_get_next_offs(old_rec, page_is_comp(page));
+ ut_ad(next_rec == (next_offs ? page + next_offs : NULL));
+#endif
+
+ page_header_set_ptr(page, page_zip, PAGE_FREE, next_rec);
+
+ garbage = page_header_get_field(page, PAGE_GARBAGE);
+ ut_ad(garbage >= need);
+
+ page_header_set_field(page, page_zip, PAGE_GARBAGE, garbage - need);
+}
+
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ if (UNIV_LIKELY(comp)) {
+ return((ulint)(UNIV_PAGE_SIZE
+ - PAGE_NEW_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+ }
+
+ return((ulint)(UNIV_PAGE_SIZE
+ - PAGE_OLD_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+}
+
+/************************************************************//**
+Each user record on a page, and also the deleted user records in the heap
+takes its size plus the fraction of the dir cell size /
+PAGE_DIR_SLOT_MIN_N_OWNED bytes for it. If the sum of these exceeds the
+value of page_get_free_space_of_empty, the insert is impossible, otherwise
+it is allowed. This function returns the maximum combined size of records
+which can be inserted on top of the record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs) /*!< in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ if (page_is_comp(page)) {
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_NEW_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + page_dir_get_n_heap(page) - 2);
+
+ free_space = page_get_free_space_of_empty(TRUE);
+ } else {
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_OLD_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + page_dir_get_n_heap(page) - 2);
+
+ free_space = page_get_free_space_of_empty(FALSE);
+ }
+
+ /* Above the 'n_recs +' part reserves directory space for the new
+ inserted records; the '- 2' excludes page infimum and supremum
+ records */
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if a page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs) /*!< in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ occupied = page_get_data_size(page)
+ + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page));
+
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
+
+/************************************************************//**
+Puts a record to free list. */
+UNIV_INLINE
+void
+page_mem_free(
+/*==========*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ rec_t* rec, /*!< in: pointer to the (origin of) record */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ rec_t* free;
+ ulint garbage;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ free = page_header_get_ptr(page, PAGE_FREE);
+
+ page_rec_set_next(rec, free);
+ page_header_set_ptr(page, page_zip, PAGE_FREE, rec);
+
+ garbage = page_header_get_field(page, PAGE_GARBAGE);
+
+ page_header_set_field(page, page_zip, PAGE_GARBAGE,
+ garbage + rec_offs_size(offsets));
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_dir_delete(page_zip, rec, index, offsets, free);
+ } else {
+ page_header_set_field(page, page_zip, PAGE_N_RECS,
+ page_get_n_recs(page) - 1);
+ }
+}
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
new file mode 100644
index 00000000000..d9a277bf208
--- /dev/null
+++ b/storage/innobase/include/page0types.h
@@ -0,0 +1,150 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0types.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0types_h
+#define page0types_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "mtr0types.h"
+
+/** Eliminates a name collision on HP-UX */
+#define page_t ib_page_t
+/** Type of the index page */
+typedef byte page_t;
+/** Index page cursor */
+typedef struct page_cur_struct page_cur_t;
+
+/** Compressed index page */
+typedef byte page_zip_t;
+/** Compressed page descriptor */
+typedef struct page_zip_des_struct page_zip_des_t;
+
+/* The following definitions would better belong to page0zip.h,
+but we cannot include page0zip.h from rem0rec.ic, because
+page0*.h includes rem0rec.h and may include rem0rec.ic. */
+
+/** Number of bits needed for representing different compressed page sizes */
+#define PAGE_ZIP_SSIZE_BITS 3
+
+/** log2 of smallest compressed page size */
+#define PAGE_ZIP_MIN_SIZE_SHIFT 10
+/** Smallest compressed page size */
+#define PAGE_ZIP_MIN_SIZE (1 << PAGE_ZIP_MIN_SIZE_SHIFT)
+
+/** Number of supported compressed page sizes */
+#define PAGE_ZIP_NUM_SSIZE (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 2)
+#if PAGE_ZIP_NUM_SSIZE > (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_NUM_SSIZE > (1 << PAGE_ZIP_SSIZE_BITS)"
+#endif
+
+/** Compressed page descriptor */
+struct page_zip_des_struct
+{
+ page_zip_t* data; /*!< compressed page data */
+
+#ifdef UNIV_DEBUG
+ unsigned m_start:16; /*!< start offset of modification log */
+#endif /* UNIV_DEBUG */
+ unsigned m_end:16; /*!< end offset of modification log */
+ unsigned m_nonempty:1; /*!< TRUE if the modification log
+ is not empty */
+ unsigned n_blobs:12; /*!< number of externally stored
+ columns on the page; the maximum
+ is 744 on a 16 KiB page */
+ unsigned ssize:PAGE_ZIP_SSIZE_BITS;
+ /*!< 0 or compressed page size;
+ the size in bytes is
+ PAGE_ZIP_MIN_SIZE << (ssize - 1). */
+};
+
+/** Compression statistics for a given page size */
+struct page_zip_stat_struct {
+ /** Number of page compressions */
+ ulint compressed;
+ /** Number of successful page compressions */
+ ulint compressed_ok;
+ /** Number of page decompressions */
+ ulint decompressed;
+ /** Duration of page compressions in microseconds */
+ ib_uint64_t compressed_usec;
+ /** Duration of page decompressions in microseconds */
+ ib_uint64_t decompressed_usec;
+};
+
+/** Compression statistics */
+typedef struct page_zip_stat_struct page_zip_stat_t;
+
+/** Statistics on compression, indexed by page_zip_des_struct::ssize - 1 */
+extern page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE - 1];
+
+/**********************************************************************//**
+Write the "deleted" flag of a record on a compressed page. The flag must
+already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the owned flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Shift the dense page directory when a record is deleted. */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in: deleted record */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec) */
+ const byte* free) /*!< in: previous start of the free list */
+ __attribute__((nonnull(1,2,3,4)));
+
+/**********************************************************************//**
+Add a slot to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint is_clustered) /*!< in: nonzero for clustered index,
+ zero for others */
+ __attribute__((nonnull));
+#endif
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
new file mode 100644
index 00000000000..9aaa066306b
--- /dev/null
+++ b/storage/innobase/include/page0zip.h
@@ -0,0 +1,471 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.h
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifndef page0zip_h
+#define page0zip_h
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "mtr0types.h"
+#include "page0types.h"
+#include "buf0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "mem0mem.h"
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+ __attribute__((nonnull, pure));
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size); /*!< in: size in bytes */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine if a record is so big that it needs to be stored externally.
+@return FALSE if the entire record can be stored locally on the page */
+UNIV_INLINE
+ibool
+page_zip_rec_needs_ext(
+/*===================*/
+ ulint rec_size, /*!< in: length of the record in bytes */
+ ulint comp, /*!< in: nonzero=compact format */
+ ulint n_fields, /*!< in: number of fields in the record;
+ ignored if zip_size == 0 */
+ ulint zip_size) /*!< in: compressed page size in bytes, or 0 */
+ __attribute__((const));
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+UNIV_INTERN
+ulint
+page_zip_empty_size(
+/*================*/
+ ulint n_fields, /*!< in: number of columns in the index */
+ ulint zip_size) /*!< in: compressed page size in bytes */
+ __attribute__((const));
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+UNIV_INLINE
+void
+page_zip_des_init(
+/*==============*/
+ page_zip_des_t* page_zip); /*!< in/out: compressed page
+ descriptor */
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+UNIV_INTERN
+void
+page_zip_set_alloc(
+/*===============*/
+ void* stream, /*!< in/out: zlib stream */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/**********************************************************************//**
+Compress a page.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure. */
+UNIV_INTERN
+ibool
+page_zip_compress(
+/*==============*/
+ page_zip_des_t* page_zip,/*!< in: size; out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ const page_t* page, /*!< in: uncompressed page */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2,3)));
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page) /*!< out: uncompressed page, may be trashed */
+ __attribute__((nonnull));
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip); /*!< in: compressed page
+ descriptor */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Check that the compressed and decompressed pages match. */
+UNIV_INTERN
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page) /*!< in: uncompressed page */
+ __attribute__((nonnull));
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine how big record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+ __attribute__((nonnull, pure));
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if page_zip_write_rec() will succeed */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint length, /*!< in: combined size of the record */
+ ulint create) /*!< in: nonzero=add the record to
+ the heap */
+ __attribute__((nonnull, pure));
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page. The data must
+already have been written to the uncompressed page. */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* str, /*!< in: address on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Write an entire record on the compressed page. The data must already
+have been written to the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_rec(
+/*===============*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record being written */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint create) /*!< in: nonzero=insert, zero=update */
+ __attribute__((nonnull));
+
+/***********************************************************//**
+Parses a log record of writing a BLOB pointer of a record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_blob_ptr(
+/*==========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_blob_ptr(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in/out: record whose data is being
+ written */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint n, /*!< in: column index */
+ mtr_t* mtr) /*!< in: mini-transaction handle,
+ or NULL if no logging is needed */
+ __attribute__((nonnull(1,2,3,4)));
+
+/***********************************************************//**
+Parses a log record of writing the node pointer of a record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_node_ptr(
+/*==========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+UNIV_INTERN
+void
+page_zip_write_node_ptr(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ ulint size, /*!< in: data size of rec */
+ ulint ptr, /*!< in: node pointer */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */
+UNIV_INTERN
+void
+page_zip_write_trx_id_and_roll_ptr(
+/*===============================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint trx_id_col,/*!< in: column number of TRX_ID in rec */
+ trx_id_t trx_id, /*!< in: transaction identifier */
+ roll_ptr_t roll_ptr)/*!< in: roll_ptr */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "deleted" flag of a record on a compressed page. The flag must
+already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the owned flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_insert(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* prev_rec,/*!< in: record after which to insert */
+ const byte* free_rec,/*!< in: record from which rec was
+ allocated, or NULL */
+ byte* rec); /*!< in: record to insert */
+
+/**********************************************************************//**
+Shift the dense page directory and the array of BLOB pointers
+when a record is deleted. */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in: deleted record */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec) */
+ const byte* free) /*!< in: previous start of the free list */
+ __attribute__((nonnull(1,2,3,4)));
+
+/**********************************************************************//**
+Add a slot to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint is_clustered) /*!< in: nonzero for clustered index,
+ zero for others */
+ __attribute__((nonnull));
+
+/***********************************************************//**
+Parses a log record of writing to the header of a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_header(
+/*========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page. The data must
+already have been written to the uncompressed page.
+However, the data portion of the uncompressed page may differ from
+the compressed page when a record is being inserted in
+page_cur_insert_rec_low(). */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* str, /*!< in: address on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return TRUE on success, FALSE on failure; page and page_zip will be
+left intact on failure. */
+UNIV_INTERN
+ibool
+page_zip_reorganize(
+/*================*/
+ buf_block_t* block, /*!< in/out: page with compressed page;
+ on the compressed page, in: size;
+ out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ mtr_t* mtr) /*!< in: mini-transaction */
+ __attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Copy the records of a page byte for byte. Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records. Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+UNIV_INTERN
+void
+page_zip_copy_recs(
+/*===============*/
+ page_zip_des_t* page_zip, /*!< out: copy of src_zip
+ (n_blobs, m_start, m_end,
+ m_nonempty, data[0..size-1]) */
+ page_t* page, /*!< out: copy of src */
+ const page_zip_des_t* src_zip, /*!< in: compressed page */
+ const page_t* src, /*!< in: page */
+ dict_index_t* index, /*!< in: index of the B-tree */
+ mtr_t* mtr) /*!< in: mini-transaction */
+ __attribute__((nonnull(1,2,3,4)));
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Parses a log record of compressing an index page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_compress(
+/*====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< out: compressed page */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Calculate the compressed page checksum.
+@return page checksum */
+UNIV_INTERN
+ulint
+page_zip_calc_checksum(
+/*===================*/
+ const void* data, /*!< in: compressed page */
+ ulint size) /*!< in: size of compressed page */
+ __attribute__((nonnull));
+
+#ifndef UNIV_HOTBACKUP
+/** Check if a pointer to an uncompressed page matches a compressed page.
+@param ptr pointer to an uncompressed page frame
+@param page_zip compressed page descriptor
+@return TRUE if ptr and page_zip refer to the same block */
+# define PAGE_ZIP_MATCH(ptr, page_zip) \
+ (buf_frame_get_page_zip(ptr) == (page_zip))
+#else /* !UNIV_HOTBACKUP */
+/** Check if a pointer to an uncompressed page matches a compressed page.
+@param ptr pointer to an uncompressed page frame
+@param page_zip compressed page descriptor
+@return TRUE if ptr and page_zip refer to the same block */
+# define PAGE_ZIP_MATCH(ptr, page_zip) \
+ (page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
+
+#ifndef UNIV_NONINL
+# include "page0zip.ic"
+#endif
+
+#endif /* page0zip_h */
diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic
new file mode 100644
index 00000000000..75cc7a9fcc4
--- /dev/null
+++ b/storage/innobase/include/page0zip.ic
@@ -0,0 +1,397 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.ic
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "page0zip.h"
+#include "page0page.h"
+
+/* The format of compressed pages is as follows.
+
+The header and trailer of the uncompressed pages, excluding the page
+directory in the trailer, are copied as is to the header and trailer
+of the compressed page.
+
+At the end of the compressed page, there is a dense page directory
+pointing to every user record contained on the page, including deleted
+records on the free list. The dense directory is indexed in the
+collation order, i.e., in the order in which the record list is
+linked on the uncompressed page. The infimum and supremum records are
+excluded. The two most significant bits of the entries are allocated
+for the delete-mark and an n_owned flag indicating the last record in
+a chain of records pointed to from the sparse page directory on the
+uncompressed page.
+
+The data between PAGE_ZIP_START and the last page directory entry will
+be written in compressed format, starting at offset PAGE_DATA.
+Infimum and supremum records are not stored. We exclude the
+REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered
+from the dense page directory stored at the end of the compressed
+page.
+
+The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
+roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
+externally stored columns are stored separately, in ascending order of
+heap_no and column index, starting backwards from the dense page
+directory.
+
+The compressed data stream may be followed by a modification log
+covering the compressed portion of the page, as follows.
+
+MODIFICATION LOG ENTRY FORMAT
+- write record:
+ - (heap_no - 1) << 1 (1..2 bytes)
+ - extra bytes backwards
+ - data bytes
+- clear record:
+ - (heap_no - 1) << 1 | 1 (1..2 bytes)
+
+The integer values are stored in a variable-length format:
+- 0xxxxxxx: 0..127
+- 1xxxxxxx xxxxxxxx: 0..32767
+
+The end of the modification log is marked by a 0 byte.
+
+In summary, the compressed page looks like this:
+
+(1) Uncompressed page header (PAGE_DATA bytes)
+(2) Compressed index information
+(3) Compressed page data
+(4) Page modification log (page_zip->m_start..page_zip->m_end)
+(5) Empty zero-filled space
+(6) BLOB pointers (on leaf pages)
+ - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
+ - in descending collation order
+(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
+ - indexed by heap_no
+ - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
+ - REC_NODE_PTR_SIZE for non-leaf pages
+ - 0 otherwise
+(8) dense page directory, stored backwards
+ - n_dense = n_heap - 2
+ - existing records in ascending collation order
+ - deleted records (free list) in link order
+*/
+
+/** Start offset of the area that will be compressed */
+#define PAGE_ZIP_START PAGE_NEW_SUPREMUM_END
+/** Size of an compressed page directory entry */
+#define PAGE_ZIP_DIR_SLOT_SIZE 2
+/** Mask of record offsets */
+#define PAGE_ZIP_DIR_SLOT_MASK 0x3fff
+/** 'owned' flag */
+#define PAGE_ZIP_DIR_SLOT_OWNED 0x4000
+/** 'deleted' flag */
+#define PAGE_ZIP_DIR_SLOT_DEL 0x8000
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size;
+
+ if (UNIV_UNLIKELY(!page_zip->ssize)) {
+ return(0);
+ }
+
+ size = (PAGE_ZIP_MIN_SIZE >> 1) << page_zip->ssize;
+
+ ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+ ut_ad(size <= UNIV_PAGE_SIZE);
+
+ return(size);
+}
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size) /*!< in: size in bytes */
+{
+ if (size) {
+ int ssize;
+
+ ut_ad(ut_is_2pow(size));
+
+ for (ssize = 1; size > (ulint) (512 << ssize); ssize++) {
+ }
+
+ page_zip->ssize = ssize;
+ } else {
+ page_zip->ssize = 0;
+ }
+
+ ut_ad(page_zip_get_size(page_zip) == size);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine if a record is so big that it needs to be stored externally.
+@return FALSE if the entire record can be stored locally on the page */
+UNIV_INLINE
+ibool
+page_zip_rec_needs_ext(
+/*===================*/
+ ulint rec_size, /*!< in: length of the record in bytes */
+ ulint comp, /*!< in: nonzero=compact format */
+ ulint n_fields, /*!< in: number of fields in the record;
+ ignored if zip_size == 0 */
+ ulint zip_size) /*!< in: compressed page size in bytes, or 0 */
+{
+ ut_ad(rec_size > comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(comp || !zip_size);
+
+#if UNIV_PAGE_SIZE > REC_MAX_DATA_SIZE
+ if (UNIV_UNLIKELY(rec_size >= REC_MAX_DATA_SIZE)) {
+ return(TRUE);
+ }
+#endif
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ ut_ad(comp);
+ /* On a compressed page, there is a two-byte entry in
+ the dense page directory for every record. But there
+ is no record header. There should be enough room for
+ one record on an empty leaf page. Subtract 1 byte for
+ the encoded heap number. Check also the available space
+ on the uncompressed page. */
+ return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2)
+ >= (page_zip_empty_size(n_fields, zip_size) - 1)
+ || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
+ }
+
+ return(rec_size >= page_get_free_space_of_empty(comp) / 2);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip)/*!< in: compressed page descriptor */
+{
+ ut_ad(page_zip);
+ ut_ad(page_zip->data);
+ ut_ad(page_zip->ssize < PAGE_ZIP_NUM_SSIZE);
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
+ ut_ad(page_zip->m_start <= page_zip->m_end);
+ ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
+ ut_ad(page_zip->n_blobs
+ < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Determine if the length of the page trailer.
+@return length of the page trailer, in bytes, not including the
+terminating zero byte of the modification log */
+UNIV_INLINE
+ibool
+page_zip_get_trailer_len(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint* entry_size)/*!< out: size of the uncompressed
+ portion of a user record */
+{
+ ulint uncompressed_size;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ if (UNIV_UNLIKELY(!page_is_leaf(page_zip->data))) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + REC_NODE_PTR_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ } else if (UNIV_UNLIKELY(is_clust)) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ } else {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ }
+
+ if (entry_size) {
+ *entry_size = uncompressed_size;
+ }
+
+ return((page_dir_get_n_heap(page_zip->data) - 2)
+ * uncompressed_size
+ + page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE);
+}
+
+/**********************************************************************//**
+Determine how big record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+{
+ ulint uncompressed_size;
+ ulint trailer_len;
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust,
+ &uncompressed_size);
+
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += uncompressed_size;
+
+ return((lint) page_zip_get_size(page_zip)
+ - trailer_len - page_zip->m_end
+ - (REC_N_NEW_EXTRA_BYTES - 2));
+}
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if enough space is available */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint length, /*!< in: combined size of the record */
+ ulint create) /*!< in: nonzero=add the record to
+ the heap */
+{
+ ulint uncompressed_size;
+ ulint trailer_len;
+
+ ut_ad(length > REC_N_NEW_EXTRA_BYTES);
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust,
+ &uncompressed_size);
+
+ /* Subtract the fixed extra bytes and add the maximum
+ space needed for identifying the record (encoded heap_no). */
+ length -= REC_N_NEW_EXTRA_BYTES - 2;
+
+ if (UNIV_UNLIKELY(create)) {
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += uncompressed_size;
+ }
+
+ return(UNIV_LIKELY(length
+ + trailer_len
+ + page_zip->m_end
+ < page_zip_get_size(page_zip)));
+}
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+UNIV_INLINE
+void
+page_zip_des_init(
+/*==============*/
+ page_zip_des_t* page_zip) /*!< in/out: compressed page
+ descriptor */
+{
+ memset(page_zip, 0, sizeof *page_zip);
+}
+
+/**********************************************************************//**
+Write a log record of writing to the uncompressed header portion of a page. */
+UNIV_INTERN
+void
+page_zip_write_header_log(
+/*======================*/
+ const byte* data,/*!< in: data on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page. The data must
+already have been written to the uncompressed page.
+However, the data portion of the uncompressed page may differ from
+the compressed page when a record is being inserted in
+page_cur_insert_rec_zip(). */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* str, /*!< in: address on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+{
+ ulint pos;
+
+ ut_ad(PAGE_ZIP_MATCH(str, page_zip));
+ ut_ad(page_zip_simple_validate(page_zip));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ pos = page_offset(str);
+
+ ut_ad(pos < PAGE_DATA);
+
+ memcpy(page_zip->data + pos, str, length);
+
+ /* The following would fail in page_cur_insert_rec_zip(). */
+ /* ut_ad(page_zip_validate(page_zip, str - pos)); */
+
+ if (UNIV_LIKELY_NULL(mtr)) {
+#ifndef UNIV_HOTBACKUP
+ page_zip_write_header_log(str, length, mtr);
+#endif /* !UNIV_HOTBACKUP */
+ }
+}
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h
new file mode 100644
index 00000000000..3de233eed3a
--- /dev/null
+++ b/storage/innobase/include/pars0grm.h
@@ -0,0 +1,236 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software
+Foundation, Inc.
+
+As a special exception, when this file is copied by Bison into a
+Bison output file, you may use that output file without restriction.
+This special exception was added by the Free Software Foundation
+in version 1.24 of Bison.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/* A Bison parser, made by GNU Bison 1.875d. */
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ /* Put the tokens into the symbol table, so that GDB and other debuggers
+ know about them. */
+ enum yytokentype {
+ PARS_INT_LIT = 258,
+ PARS_FLOAT_LIT = 259,
+ PARS_STR_LIT = 260,
+ PARS_FIXBINARY_LIT = 261,
+ PARS_BLOB_LIT = 262,
+ PARS_NULL_LIT = 263,
+ PARS_ID_TOKEN = 264,
+ PARS_AND_TOKEN = 265,
+ PARS_OR_TOKEN = 266,
+ PARS_NOT_TOKEN = 267,
+ PARS_GE_TOKEN = 268,
+ PARS_LE_TOKEN = 269,
+ PARS_NE_TOKEN = 270,
+ PARS_PROCEDURE_TOKEN = 271,
+ PARS_IN_TOKEN = 272,
+ PARS_OUT_TOKEN = 273,
+ PARS_BINARY_TOKEN = 274,
+ PARS_BLOB_TOKEN = 275,
+ PARS_INT_TOKEN = 276,
+ PARS_INTEGER_TOKEN = 277,
+ PARS_FLOAT_TOKEN = 278,
+ PARS_CHAR_TOKEN = 279,
+ PARS_IS_TOKEN = 280,
+ PARS_BEGIN_TOKEN = 281,
+ PARS_END_TOKEN = 282,
+ PARS_IF_TOKEN = 283,
+ PARS_THEN_TOKEN = 284,
+ PARS_ELSE_TOKEN = 285,
+ PARS_ELSIF_TOKEN = 286,
+ PARS_LOOP_TOKEN = 287,
+ PARS_WHILE_TOKEN = 288,
+ PARS_RETURN_TOKEN = 289,
+ PARS_SELECT_TOKEN = 290,
+ PARS_SUM_TOKEN = 291,
+ PARS_COUNT_TOKEN = 292,
+ PARS_DISTINCT_TOKEN = 293,
+ PARS_FROM_TOKEN = 294,
+ PARS_WHERE_TOKEN = 295,
+ PARS_FOR_TOKEN = 296,
+ PARS_DDOT_TOKEN = 297,
+ PARS_READ_TOKEN = 298,
+ PARS_ORDER_TOKEN = 299,
+ PARS_BY_TOKEN = 300,
+ PARS_ASC_TOKEN = 301,
+ PARS_DESC_TOKEN = 302,
+ PARS_INSERT_TOKEN = 303,
+ PARS_INTO_TOKEN = 304,
+ PARS_VALUES_TOKEN = 305,
+ PARS_UPDATE_TOKEN = 306,
+ PARS_SET_TOKEN = 307,
+ PARS_DELETE_TOKEN = 308,
+ PARS_CURRENT_TOKEN = 309,
+ PARS_OF_TOKEN = 310,
+ PARS_CREATE_TOKEN = 311,
+ PARS_TABLE_TOKEN = 312,
+ PARS_INDEX_TOKEN = 313,
+ PARS_UNIQUE_TOKEN = 314,
+ PARS_CLUSTERED_TOKEN = 315,
+ PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
+ PARS_ON_TOKEN = 317,
+ PARS_ASSIGN_TOKEN = 318,
+ PARS_DECLARE_TOKEN = 319,
+ PARS_CURSOR_TOKEN = 320,
+ PARS_SQL_TOKEN = 321,
+ PARS_OPEN_TOKEN = 322,
+ PARS_FETCH_TOKEN = 323,
+ PARS_CLOSE_TOKEN = 324,
+ PARS_NOTFOUND_TOKEN = 325,
+ PARS_TO_CHAR_TOKEN = 326,
+ PARS_TO_NUMBER_TOKEN = 327,
+ PARS_TO_BINARY_TOKEN = 328,
+ PARS_BINARY_TO_NUMBER_TOKEN = 329,
+ PARS_SUBSTR_TOKEN = 330,
+ PARS_REPLSTR_TOKEN = 331,
+ PARS_CONCAT_TOKEN = 332,
+ PARS_INSTR_TOKEN = 333,
+ PARS_LENGTH_TOKEN = 334,
+ PARS_SYSDATE_TOKEN = 335,
+ PARS_PRINTF_TOKEN = 336,
+ PARS_ASSERT_TOKEN = 337,
+ PARS_RND_TOKEN = 338,
+ PARS_RND_STR_TOKEN = 339,
+ PARS_ROW_PRINTF_TOKEN = 340,
+ PARS_COMMIT_TOKEN = 341,
+ PARS_ROLLBACK_TOKEN = 342,
+ PARS_WORK_TOKEN = 343,
+ PARS_UNSIGNED_TOKEN = 344,
+ PARS_EXIT_TOKEN = 345,
+ PARS_FUNCTION_TOKEN = 346,
+ PARS_LOCK_TOKEN = 347,
+ PARS_SHARE_TOKEN = 348,
+ PARS_MODE_TOKEN = 349,
+ NEG = 350
+ };
+#endif
+#define PARS_INT_LIT 258
+#define PARS_FLOAT_LIT 259
+#define PARS_STR_LIT 260
+#define PARS_FIXBINARY_LIT 261
+#define PARS_BLOB_LIT 262
+#define PARS_NULL_LIT 263
+#define PARS_ID_TOKEN 264
+#define PARS_AND_TOKEN 265
+#define PARS_OR_TOKEN 266
+#define PARS_NOT_TOKEN 267
+#define PARS_GE_TOKEN 268
+#define PARS_LE_TOKEN 269
+#define PARS_NE_TOKEN 270
+#define PARS_PROCEDURE_TOKEN 271
+#define PARS_IN_TOKEN 272
+#define PARS_OUT_TOKEN 273
+#define PARS_BINARY_TOKEN 274
+#define PARS_BLOB_TOKEN 275
+#define PARS_INT_TOKEN 276
+#define PARS_INTEGER_TOKEN 277
+#define PARS_FLOAT_TOKEN 278
+#define PARS_CHAR_TOKEN 279
+#define PARS_IS_TOKEN 280
+#define PARS_BEGIN_TOKEN 281
+#define PARS_END_TOKEN 282
+#define PARS_IF_TOKEN 283
+#define PARS_THEN_TOKEN 284
+#define PARS_ELSE_TOKEN 285
+#define PARS_ELSIF_TOKEN 286
+#define PARS_LOOP_TOKEN 287
+#define PARS_WHILE_TOKEN 288
+#define PARS_RETURN_TOKEN 289
+#define PARS_SELECT_TOKEN 290
+#define PARS_SUM_TOKEN 291
+#define PARS_COUNT_TOKEN 292
+#define PARS_DISTINCT_TOKEN 293
+#define PARS_FROM_TOKEN 294
+#define PARS_WHERE_TOKEN 295
+#define PARS_FOR_TOKEN 296
+#define PARS_DDOT_TOKEN 297
+#define PARS_READ_TOKEN 298
+#define PARS_ORDER_TOKEN 299
+#define PARS_BY_TOKEN 300
+#define PARS_ASC_TOKEN 301
+#define PARS_DESC_TOKEN 302
+#define PARS_INSERT_TOKEN 303
+#define PARS_INTO_TOKEN 304
+#define PARS_VALUES_TOKEN 305
+#define PARS_UPDATE_TOKEN 306
+#define PARS_SET_TOKEN 307
+#define PARS_DELETE_TOKEN 308
+#define PARS_CURRENT_TOKEN 309
+#define PARS_OF_TOKEN 310
+#define PARS_CREATE_TOKEN 311
+#define PARS_TABLE_TOKEN 312
+#define PARS_INDEX_TOKEN 313
+#define PARS_UNIQUE_TOKEN 314
+#define PARS_CLUSTERED_TOKEN 315
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
+#define PARS_ON_TOKEN 317
+#define PARS_ASSIGN_TOKEN 318
+#define PARS_DECLARE_TOKEN 319
+#define PARS_CURSOR_TOKEN 320
+#define PARS_SQL_TOKEN 321
+#define PARS_OPEN_TOKEN 322
+#define PARS_FETCH_TOKEN 323
+#define PARS_CLOSE_TOKEN 324
+#define PARS_NOTFOUND_TOKEN 325
+#define PARS_TO_CHAR_TOKEN 326
+#define PARS_TO_NUMBER_TOKEN 327
+#define PARS_TO_BINARY_TOKEN 328
+#define PARS_BINARY_TO_NUMBER_TOKEN 329
+#define PARS_SUBSTR_TOKEN 330
+#define PARS_REPLSTR_TOKEN 331
+#define PARS_CONCAT_TOKEN 332
+#define PARS_INSTR_TOKEN 333
+#define PARS_LENGTH_TOKEN 334
+#define PARS_SYSDATE_TOKEN 335
+#define PARS_PRINTF_TOKEN 336
+#define PARS_ASSERT_TOKEN 337
+#define PARS_RND_TOKEN 338
+#define PARS_RND_STR_TOKEN 339
+#define PARS_ROW_PRINTF_TOKEN 340
+#define PARS_COMMIT_TOKEN 341
+#define PARS_ROLLBACK_TOKEN 342
+#define PARS_WORK_TOKEN 343
+#define PARS_UNSIGNED_TOKEN 344
+#define PARS_EXIT_TOKEN 345
+#define PARS_FUNCTION_TOKEN 346
+#define PARS_LOCK_TOKEN 347
+#define PARS_SHARE_TOKEN 348
+#define PARS_MODE_TOKEN 349
+#define NEG 350
+
+
+
+
+#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+extern YYSTYPE yylval;
+
+
+
diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h
new file mode 100644
index 00000000000..42d956068f8
--- /dev/null
+++ b/storage/innobase/include/pars0opt.h
@@ -0,0 +1,75 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.h
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0sym.h"
+#include "dict0types.h"
+#include "row0sel.h"
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes to tables to use. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement. */
+UNIV_INTERN
+void
+opt_search_plan(
+/*============*/
+ sel_node_t* sel_node); /*!< in: parsed select node */
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+UNIV_INTERN
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /*!< in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /*!< in: index to use */
+ sym_node_list_t* col_list, /*!< in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /*!< in: plan or NULL */
+ que_node_t* exp); /*!< in: expression or condition */
+/********************************************************************//**
+Prints info of a query plan. */
+UNIV_INTERN
+void
+opt_print_query_plan(
+/*=================*/
+ sel_node_t* sel_node); /*!< in: select node */
+
+#ifndef UNIV_NONINL
+#include "pars0opt.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/pars0opt.ic b/storage/innobase/include/pars0opt.ic
new file mode 100644
index 00000000000..e0bb6bf1af2
--- /dev/null
+++ b/storage/innobase/include/pars0opt.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.ic
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
new file mode 100644
index 00000000000..a7de7f2292e
--- /dev/null
+++ b/storage/innobase/include/pars0pars.h
@@ -0,0 +1,742 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.h
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0pars_h
+#define pars0pars_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+
+/** Type of the user functions. The first argument is always InnoDB-supplied
+and varies in type, while 'user_arg' is a user-supplied argument. The
+meaning of the return type also varies. See the individual use cases, e.g.
+the FETCH statement, for details on them. */
+typedef void* (*pars_user_func_cb_t)(void* arg, void* user_arg);
+
+/** If the following is set TRUE, the parser will emit debugging
+information */
+extern int yydebug;
+
+#ifdef UNIV_SQL_DEBUG
+/** If the following is set TRUE, the lexer will print the SQL string
+as it tokenizes it */
+extern ibool pars_print_lexed;
+#endif /* UNIV_SQL_DEBUG */
+
+/* Global variable used while parsing a single procedure or query : the code is
+NOT re-entrant */
+extern sym_tab_t* pars_sym_tab_global;
+
+extern pars_res_word_t pars_to_char_token;
+extern pars_res_word_t pars_to_number_token;
+extern pars_res_word_t pars_to_binary_token;
+extern pars_res_word_t pars_binary_to_number_token;
+extern pars_res_word_t pars_substr_token;
+extern pars_res_word_t pars_replstr_token;
+extern pars_res_word_t pars_concat_token;
+extern pars_res_word_t pars_length_token;
+extern pars_res_word_t pars_instr_token;
+extern pars_res_word_t pars_sysdate_token;
+extern pars_res_word_t pars_printf_token;
+extern pars_res_word_t pars_assert_token;
+extern pars_res_word_t pars_rnd_token;
+extern pars_res_word_t pars_rnd_str_token;
+extern pars_res_word_t pars_count_token;
+extern pars_res_word_t pars_sum_token;
+extern pars_res_word_t pars_distinct_token;
+extern pars_res_word_t pars_binary_token;
+extern pars_res_word_t pars_blob_token;
+extern pars_res_word_t pars_int_token;
+extern pars_res_word_t pars_char_token;
+extern pars_res_word_t pars_float_token;
+extern pars_res_word_t pars_update_token;
+extern pars_res_word_t pars_asc_token;
+extern pars_res_word_t pars_desc_token;
+extern pars_res_word_t pars_open_token;
+extern pars_res_word_t pars_close_token;
+extern pars_res_word_t pars_share_token;
+extern pars_res_word_t pars_unique_token;
+extern pars_res_word_t pars_clustered_token;
+
+extern ulint pars_star_denoter;
+
+/* Procedure parameter types */
+#define PARS_INPUT 0
+#define PARS_OUTPUT 1
+#define PARS_NOT_PARAM 2
+
+int
+yyparse(void);
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+UNIV_INTERN
+que_t*
+pars_sql(
+/*=====*/
+ pars_info_t* info, /*!< in: extra information, or NULL */
+ const char* str); /*!< in: SQL string */
+/*************************************************************//**
+Retrieves characters to the lexical analyzer. */
+UNIV_INTERN
+void
+pars_get_lex_chars(
+/*===============*/
+ char* buf, /*!< in/out: buffer where to copy */
+ int* result, /*!< out: number of characters copied or EOF */
+ int max_size); /*!< in: maximum number of characters which fit
+ in the buffer */
+/*************************************************************//**
+Called by yyparse on error. */
+UNIV_INTERN
+void
+yyerror(
+/*====*/
+ const char* s); /*!< in: error message string */
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the variable */
+ pars_res_word_t* type); /*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_func(
+/*======*/
+ que_node_t* res_word,/*!< in: function name reserved word */
+ que_node_t* arg); /*!< in: first argument in the argument list */
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_op(
+/*====*/
+ int func, /*!< in: operator token code */
+ que_node_t* arg1, /*!< in: first argument */
+ que_node_t* arg2); /*!< in: second argument or NULL for an unary
+ operator */
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+UNIV_INTERN
+order_node_t*
+pars_order_by(
+/*==========*/
+ sym_node_t* column, /*!< in: column name */
+ pars_res_word_t* asc); /*!< in: &pars_asc_token or pars_desc_token */
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_list(
+/*=============*/
+ que_node_t* select_list, /*!< in: select list */
+ sym_node_t* into_list); /*!< in: variables list or NULL */
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+ sym_node_t* sym_node, /*!< in: cursor id node in the symbol
+ table */
+ sel_node_t* select_node); /*!< in: select node */
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_function_declaration(
+/*======================*/
+ sym_node_t* sym_node); /*!< in: function id node in the symbol
+ table */
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_statement(
+/*==================*/
+ sel_node_t* select_node, /*!< in: select node already containing
+ the select list */
+ sym_node_t* table_list, /*!< in: table list */
+ que_node_t* search_cond, /*!< in: search condition or NULL */
+ pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */
+ pars_res_word_t* consistent_read,/*!< in: NULL or
+ &pars_consistent_token */
+ order_node_t* order_by); /*!< in: NULL or an order-by node */
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return column assignment node */
+UNIV_INTERN
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+ sym_node_t* column, /*!< in: column to assign */
+ que_node_t* exp); /*!< in: value to assign */
+/*********************************************************************//**
+Parses a delete or update statement start.
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+ ibool is_delete, /*!< in: TRUE if delete */
+ sym_node_t* table_sym, /*!< in: table name node */
+ col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL
+ if delete */
+/*********************************************************************//**
+Parses an update or delete statement.
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement(
+/*==================*/
+ upd_node_t* node, /*!< in: update node */
+ sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in
+ the symbol table or NULL */
+ que_node_t* search_cond); /*!< in: search condition or NULL */
+/*********************************************************************//**
+Parses an insert statement.
+@return own: update node in a query tree */
+UNIV_INTERN
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+ sym_node_t* table_sym, /*!< in: table name node */
+ que_node_t* values_list, /*!< in: value expression list or NULL */
+ sel_node_t* select); /*!< in: select condition or NULL */
+/*********************************************************************//**
+Parses a procedure parameter declaration.
+@return own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_parameter_declaration(
+/*=======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the parameter */
+ ulint param_type,
+ /*!< in: PARS_INPUT or PARS_OUTPUT */
+ pars_res_word_t* type); /*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses an elsif element.
+@return elsif node */
+UNIV_INTERN
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses an if-statement.
+@return if-statement node */
+UNIV_INTERN
+if_node_t*
+pars_if_statement(
+/*==============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list, /*!< in: statement list */
+ que_node_t* else_part); /*!< in: else-part statement list */
+/*********************************************************************//**
+Parses a for-loop-statement.
+@return for-statement node */
+UNIV_INTERN
+for_node_t*
+pars_for_statement(
+/*===============*/
+ sym_node_t* loop_var, /*!< in: loop variable */
+ que_node_t* loop_start_limit,/*!< in: loop start expression */
+ que_node_t* loop_end_limit, /*!< in: loop end expression */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses a while-statement.
+@return while-statement node */
+UNIV_INTERN
+while_node_t*
+pars_while_statement(
+/*=================*/
+ que_node_t* cond, /*!< in: while-condition */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses an exit statement.
+@return exit statement node */
+UNIV_INTERN
+exit_node_t*
+pars_exit_statement(void);
+/*=====================*/
+/*********************************************************************//**
+Parses a return-statement.
+@return return-statement node */
+UNIV_INTERN
+return_node_t*
+pars_return_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a procedure call.
+@return function node */
+UNIV_INTERN
+func_node_t*
+pars_procedure_call(
+/*================*/
+ que_node_t* res_word,/*!< in: procedure name reserved word */
+ que_node_t* args); /*!< in: argument list */
+/*********************************************************************//**
+Parses an assignment statement.
+@return assignment statement node */
+UNIV_INTERN
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+ sym_node_t* var, /*!< in: variable to assign */
+ que_node_t* val); /*!< in: value to assign */
+/*********************************************************************//**
+Parses a fetch statement. into_list or user_func (but not both) must be
+non-NULL.
+@return fetch statement node */
+UNIV_INTERN
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+ sym_node_t* cursor, /*!< in: cursor node */
+ sym_node_t* into_list, /*!< in: variables to set, or NULL */
+ sym_node_t* user_func); /*!< in: user function name, or NULL */
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return fetch statement node */
+UNIV_INTERN
+open_node_t*
+pars_open_statement(
+/*================*/
+ ulint type, /*!< in: ROW_SEL_OPEN_CURSOR
+ or ROW_SEL_CLOSE_CURSOR */
+ sym_node_t* cursor); /*!< in: cursor node */
+/*********************************************************************//**
+Parses a row_printf-statement.
+@return row_printf-statement node */
+UNIV_INTERN
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+ sel_node_t* sel_node); /*!< in: select node */
+/*********************************************************************//**
+Parses a commit statement.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+pars_commit_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a rollback statement.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+pars_rollback_statement(void);
+/*=========================*/
+/*********************************************************************//**
+Parses a column definition at a table creation.
+@return column sym table node */
+UNIV_INTERN
+sym_node_t*
+pars_column_def(
+/*============*/
+ sym_node_t* sym_node, /*!< in: column node in the
+ symbol table */
+ pars_res_word_t* type, /*!< in: data type */
+ sym_node_t* len, /*!< in: length of column, or
+ NULL */
+ void* is_unsigned, /*!< in: if not NULL, column
+ is of type UNSIGNED. */
+ void* is_not_null); /*!< in: if not NULL, column
+ is of type NOT NULL. */
+/*********************************************************************//**
+Parses a table creation operation.
+@return table create subgraph */
+UNIV_INTERN
+tab_node_t*
+pars_create_table(
+/*==============*/
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_defs, /*!< in: list of column names */
+ void* not_fit_in_memory);/*!< in: a non-NULL pointer means that
+ this is a table which in simulations
+ should be simulated as not fitting
+ in memory; thread is put to sleep
+ to simulate disk accesses; NOTE that
+ this flag is not stored to the data
+ dictionary on disk, and the database
+ will forget about non-NULL value if
+ it has to reload the table definition
+ from disk */
+/*********************************************************************//**
+Parses an index creation operation.
+@return index create subgraph */
+UNIV_INTERN
+ind_node_t*
+pars_create_index(
+/*==============*/
+ pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */
+ pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */
+ sym_node_t* index_sym, /*!< in: index name node in the symbol
+ table */
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_list); /*!< in: list of column names */
+/*********************************************************************//**
+Parses a procedure definition.
+@return query fork node */
+UNIV_INTERN
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+ sym_node_t* sym_node, /*!< in: procedure id node in the symbol
+ table */
+ sym_node_t* param_list, /*!< in: parameter declaration list */
+ que_node_t* stat_list); /*!< in: statement list */
+
+/*************************************************************//**
+Parses a stored procedure call, when this is not within another stored
+procedure, that is, the client issues a procedure call directly.
+In MySQL/InnoDB, stored InnoDB procedures are invoked via the
+parsed procedure tree, not via InnoDB SQL, so this function is not used.
+@return query graph */
+UNIV_INTERN
+que_fork_t*
+pars_stored_procedure_call(
+/*=======================*/
+ sym_node_t* sym_node); /*!< in: stored procedure name */
+/******************************************************************//**
+Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running. The fork created is of
+type QUE_FORK_MYSQL_INTERFACE.
+@return query thread node to run */
+UNIV_INTERN
+que_thr_t*
+pars_complete_graph_for_exec(
+/*=========================*/
+ que_node_t* node, /*!< in: root node for an incomplete
+ query graph */
+ trx_t* trx, /*!< in: transaction handle */
+ mem_heap_t* heap); /*!< in: memory heap from which allocated */
+
+/****************************************************************//**
+Create parser info struct.
+@return own: info struct */
+UNIV_INTERN
+pars_info_t*
+pars_info_create(void);
+/*==================*/
+
+/****************************************************************//**
+Free info struct and everything it contains. */
+UNIV_INTERN
+void
+pars_info_free(
+/*===========*/
+ pars_info_t* info); /*!< in, own: info struct */
+
+/****************************************************************//**
+Add bound literal. */
+UNIV_INTERN
+void
+pars_info_add_literal(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const void* address, /*!< in: address */
+ ulint length, /*!< in: length of data */
+ ulint type, /*!< in: type, e.g. DATA_FIXBINARY */
+ ulint prtype); /*!< in: precise type, e.g.
+ DATA_UNSIGNED */
+
+/****************************************************************//**
+Equivalent to pars_info_add_literal(info, name, str, strlen(str),
+DATA_VARCHAR, DATA_ENGLISH). */
+UNIV_INTERN
+void
+pars_info_add_str_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* str); /*!< in: string */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ lint val); /*!< in: value */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_BINARY, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_dulint_literal(
+/*=========================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ dulint val); /*!< in: value */
+/****************************************************************//**
+Add user function. */
+UNIV_INTERN
+void
+pars_info_add_function(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: function name */
+ pars_user_func_cb_t func, /*!< in: function address */
+ void* arg); /*!< in: user-supplied argument */
+
+/****************************************************************//**
+Add bound id. */
+UNIV_INTERN
+void
+pars_info_add_id(
+/*=============*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* id); /*!< in: id */
+
+/****************************************************************//**
+Get user function with the given name.
+@return user func, or NULL if not found */
+UNIV_INTERN
+pars_user_func_t*
+pars_info_get_user_func(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: function name to find*/
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return bound literal, or NULL if not found */
+UNIV_INTERN
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: bound literal name to find */
+
+/****************************************************************//**
+Get bound id with the given name.
+@return bound id, or NULL if not found */
+UNIV_INTERN
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: bound id name to find */
+
+
+/** Extra information supplied for pars_sql(). */
+struct pars_info_struct {
+ mem_heap_t* heap; /*!< our own memory heap */
+
+ ib_vector_t* funcs; /*!< user functions, or NUll
+ (pars_user_func_t*) */
+ ib_vector_t* bound_lits; /*!< bound literals, or NULL
+ (pars_bound_lit_t*) */
+ ib_vector_t* bound_ids; /*!< bound ids, or NULL
+ (pars_bound_id_t*) */
+
+ ibool graph_owns_us; /*!< if TRUE (which is the default),
+ que_graph_free() will free us */
+};
+
+/** User-supplied function and argument. */
+struct pars_user_func_struct {
+ const char* name; /*!< function name */
+ pars_user_func_cb_t func; /*!< function address */
+ void* arg; /*!< user-supplied argument */
+};
+
+/** Bound literal. */
+struct pars_bound_lit_struct {
+ const char* name; /*!< name */
+ const void* address; /*!< address */
+ ulint length; /*!< length of data */
+ ulint type; /*!< type, e.g. DATA_FIXBINARY */
+ ulint prtype; /*!< precise type, e.g. DATA_UNSIGNED */
+};
+
+/** Bound identifier. */
+struct pars_bound_id_struct {
+ const char* name; /*!< name */
+ const char* id; /*!< identifier */
+};
+
+/** Struct used to denote a reserved word in a parsing tree */
+struct pars_res_word_struct{
+ int code; /*!< the token code for the reserved word from
+ pars0grm.h */
+};
+
+/** A predefined function or operator node in a parsing tree; this construct
+is also used for some non-functions like the assignment ':=' */
+struct func_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_FUNC */
+ int func; /*!< token code of the function name */
+ ulint class; /*!< class of the function */
+ que_node_t* args; /*!< argument(s) of the function */
+ UT_LIST_NODE_T(func_node_t) cond_list;
+ /*!< list of comparison conditions; defined
+ only for comparison operator nodes except,
+ presently, for OPT_SCROLL_TYPE ones */
+ UT_LIST_NODE_T(func_node_t) func_node_list;
+ /*!< list of function nodes in a parsed
+ query graph */
+};
+
+/** An order-by node in a select */
+struct order_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_ORDER */
+ sym_node_t* column; /*!< order-by column */
+ ibool asc; /*!< TRUE if ascending, FALSE if descending */
+};
+
+/** Procedure definition node */
+struct proc_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_PROC */
+ sym_node_t* proc_id; /*!< procedure name symbol in the symbol
+ table of this same procedure */
+ sym_node_t* param_list; /*!< input and output parameters */
+ que_node_t* stat_list; /*!< statement list */
+ sym_tab_t* sym_tab; /*!< symbol table of this procedure */
+};
+
+/** elsif-element node */
+struct elsif_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_ELSIF */
+ que_node_t* cond; /*!< if condition */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** if-statement node */
+struct if_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_IF */
+ que_node_t* cond; /*!< if condition */
+ que_node_t* stat_list; /*!< statement list */
+ que_node_t* else_part; /*!< else-part statement list */
+ elsif_node_t* elsif_list; /*!< elsif element list */
+};
+
+/** while-statement node */
+struct while_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_WHILE */
+ que_node_t* cond; /*!< while condition */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** for-loop-statement node */
+struct for_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_FOR */
+ sym_node_t* loop_var; /*!< loop variable: this is the
+ dereferenced symbol from the
+ variable declarations, not the
+ symbol occurrence in the for loop
+ definition */
+ que_node_t* loop_start_limit;/*!< initial value of loop variable */
+ que_node_t* loop_end_limit; /*!< end value of loop variable */
+ lint loop_end_value; /*!< evaluated value for the end value:
+ it is calculated only when the loop
+ is entered, and will not change within
+ the loop */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** exit statement node */
+struct exit_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_EXIT */
+};
+
+/** return-statement node */
+struct return_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_RETURN */
+};
+
+/** Assignment statement node */
+struct assign_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */
+ sym_node_t* var; /*!< variable to set */
+ que_node_t* val; /*!< value to assign */
+};
+
+/** Column assignment node */
+struct col_assign_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */
+ sym_node_t* col; /*!< column to set */
+ que_node_t* val; /*!< value to assign */
+};
+
+/** Classes of functions */
+/* @{ */
+#define PARS_FUNC_ARITH 1 /*!< +, -, *, / */
+#define PARS_FUNC_LOGICAL 2 /*!< AND, OR, NOT */
+#define PARS_FUNC_CMP 3 /*!< comparison operators */
+#define PARS_FUNC_PREDEFINED 4 /*!< TO_NUMBER, SUBSTR, ... */
+#define PARS_FUNC_AGGREGATE 5 /*!< COUNT, DISTINCT, SUM */
+#define PARS_FUNC_OTHER 6 /*!< these are not real functions,
+ e.g., := */
+/* @} */
+
+#ifndef UNIV_NONINL
+#include "pars0pars.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/pars0pars.ic b/storage/innobase/include/pars0pars.ic
new file mode 100644
index 00000000000..ae6c13cd671
--- /dev/null
+++ b/storage/innobase/include/pars0pars.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.ic
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h
new file mode 100644
index 00000000000..6d1a4b82414
--- /dev/null
+++ b/storage/innobase/include/pars0sym.h
@@ -0,0 +1,244 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.h
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0sym_h
+#define pars0sym_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "dict0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+
+/******************************************************************//**
+Creates a symbol table for a single stored procedure or query.
+@return own: symbol table */
+UNIV_INTERN
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+ mem_heap_t* heap); /*!< in: memory heap where to create */
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Frees also SQL explicit cursor definitions. */
+UNIV_INTERN
+void
+sym_tab_free_private(
+/*=================*/
+ sym_tab_t* sym_tab); /*!< in, own: symbol table */
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ ulint val); /*!< in: integer value */
+/******************************************************************//**
+Adds an string literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* str, /*!< in: string with no quotes around
+ it */
+ ulint len); /*!< in: string length */
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name, /*!< in: name of bound literal */
+ ulint* lit_type); /*!< out: type of literal (PARS_*_LIT) */
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+ sym_tab_t* sym_tab); /*!< in: symbol table */
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* name, /*!< in: identifier name */
+ ulint len); /*!< in: identifier length */
+
+/******************************************************************//**
+Add a bound identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_id(
+/*===========*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name); /*!< in: name of bound id */
+
+/** Index of sym_node_struct::field_nos corresponding to the clustered index */
+#define SYM_CLUST_FIELD_NO 0
+/** Index of sym_node_struct::field_nos corresponding to a secondary index */
+#define SYM_SEC_FIELD_NO 1
+
+/** Types of a symbol table node */
+enum sym_tab_entry {
+ SYM_VAR = 91, /*!< declared parameter or local
+ variable of a procedure */
+ SYM_IMPLICIT_VAR, /*!< storage for a intermediate result
+ of a calculation */
+ SYM_LIT, /*!< literal */
+ SYM_TABLE, /*!< database table name */
+ SYM_COLUMN, /*!< database table name */
+ SYM_CURSOR, /*!< named cursor */
+ SYM_PROCEDURE_NAME, /*!< stored procedure name */
+ SYM_INDEX, /*!< database index name */
+ SYM_FUNCTION /*!< user function name */
+};
+
+/** Symbol table node */
+struct sym_node_struct{
+ que_common_t common; /*!< node type:
+ QUE_NODE_SYMBOL */
+ /* NOTE: if the data field in 'common.val' is not NULL and the symbol
+ table node is not for a temporary column, the memory for the value has
+ been allocated from dynamic memory and it should be freed when the
+ symbol table is discarded */
+
+ /* 'alias' and 'indirection' are almost the same, but not quite.
+ 'alias' always points to the primary instance of the variable, while
+ 'indirection' does the same only if we should use the primary
+ instance's values for the node's data. This is usually the case, but
+ when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM
+ t WHERE id = x;"), we copy the values from the primary instance to
+ the cursor's instance so that they are fixed for the duration of the
+ cursor, and set 'indirection' to NULL. If we did not, the value of
+ 'x' could change between fetches and things would break horribly.
+
+ TODO: It would be cleaner to make 'indirection' a boolean field and
+ always use 'alias' to refer to the primary node. */
+
+ sym_node_t* indirection; /*!< pointer to
+ another symbol table
+ node which contains
+ the value for this
+ node, NULL otherwise */
+ sym_node_t* alias; /*!< pointer to
+ another symbol table
+ node for which this
+ node is an alias,
+ NULL otherwise */
+ UT_LIST_NODE_T(sym_node_t) col_var_list; /*!< list of table
+ columns or a list of
+ input variables for an
+ explicit cursor */
+ ibool copy_val; /*!< TRUE if a column
+ and its value should
+ be copied to dynamic
+ memory when fetched */
+ ulint field_nos[2]; /*!< if a column, in
+ the position
+ SYM_CLUST_FIELD_NO is
+ the field number in the
+ clustered index; in
+ the position
+ SYM_SEC_FIELD_NO
+ the field number in the
+ non-clustered index to
+ use first; if not found
+ from the index, then
+ ULINT_UNDEFINED */
+ ibool resolved; /*!< TRUE if the
+ meaning of a variable
+ or a column has been
+ resolved; for literals
+ this is always TRUE */
+ enum sym_tab_entry token_type; /*!< type of the
+ parsed token */
+ const char* name; /*!< name of an id */
+ ulint name_len; /*!< id name length */
+ dict_table_t* table; /*!< table definition
+ if a table id or a
+ column id */
+ ulint col_no; /*!< column number if a
+ column */
+ sel_buf_t* prefetch_buf; /*!< NULL, or a buffer
+ for cached column
+ values for prefetched
+ rows */
+ sel_node_t* cursor_def; /*!< cursor definition
+ select node if a
+ named cursor */
+ ulint param_type; /*!< PARS_INPUT,
+ PARS_OUTPUT, or
+ PARS_NOT_PARAM if not a
+ procedure parameter */
+ sym_tab_t* sym_table; /*!< back pointer to
+ the symbol table */
+ UT_LIST_NODE_T(sym_node_t) sym_list; /*!< list of symbol
+ nodes */
+};
+
+/** Symbol table */
+struct sym_tab_struct{
+ que_t* query_graph;
+ /*!< query graph generated by the
+ parser */
+ const char* sql_string;
+ /*!< SQL string to parse */
+ size_t string_len;
+ /*!< SQL string length */
+ int next_char_pos;
+ /*!< position of the next character in
+ sql_string to give to the lexical
+ analyzer */
+ pars_info_t* info; /*!< extra information, or NULL */
+ sym_node_list_t sym_list;
+ /*!< list of symbol nodes in the symbol
+ table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ func_node_list;
+ /*!< list of function nodes in the
+ parsed query graph */
+ mem_heap_t* heap; /*!< memory heap from which we can
+ allocate space */
+};
+
+#ifndef UNIV_NONINL
+#include "pars0sym.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/pars0sym.ic b/storage/innobase/include/pars0sym.ic
new file mode 100644
index 00000000000..9eb09db3a47
--- /dev/null
+++ b/storage/innobase/include/pars0sym.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.ic
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h
new file mode 100644
index 00000000000..e0a8a86bf07
--- /dev/null
+++ b/storage/innobase/include/pars0types.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0types.h
+SQL parser global types
+
+Created 1/11/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0types_h
+#define pars0types_h
+
+typedef struct pars_info_struct pars_info_t;
+typedef struct pars_user_func_struct pars_user_func_t;
+typedef struct pars_bound_lit_struct pars_bound_lit_t;
+typedef struct pars_bound_id_struct pars_bound_id_t;
+typedef struct sym_node_struct sym_node_t;
+typedef struct sym_tab_struct sym_tab_t;
+typedef struct pars_res_word_struct pars_res_word_t;
+typedef struct func_node_struct func_node_t;
+typedef struct order_node_struct order_node_t;
+typedef struct proc_node_struct proc_node_t;
+typedef struct elsif_node_struct elsif_node_t;
+typedef struct if_node_struct if_node_t;
+typedef struct while_node_struct while_node_t;
+typedef struct for_node_struct for_node_t;
+typedef struct exit_node_struct exit_node_t;
+typedef struct return_node_struct return_node_t;
+typedef struct assign_node_struct assign_node_t;
+typedef struct col_assign_node_struct col_assign_node_t;
+
+typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t;
+
+#endif
diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h
new file mode 100644
index 00000000000..420f34550e2
--- /dev/null
+++ b/storage/innobase/include/que0que.h
@@ -0,0 +1,513 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.h
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0que_h
+#define que0que_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "pars0types.h"
+
+/* If the following flag is set TRUE, the module will print trace info
+of SQL execution in the UNIV_SQL_DEBUG version */
+extern ibool que_trace_on;
+
+/***********************************************************************//**
+Adds a query graph to the session's list of graphs. */
+UNIV_INTERN
+void
+que_graph_publish(
+/*==============*/
+ que_t* graph, /*!< in: graph */
+ sess_t* sess); /*!< in: session */
+/***********************************************************************//**
+Creates a query graph fork node.
+@return own: fork node */
+UNIV_INTERN
+que_fork_t*
+que_fork_create(
+/*============*/
+ que_t* graph, /*!< in: graph, if NULL then this
+ fork node is assumed to be the
+ graph root */
+ que_node_t* parent, /*!< in: parent node */
+ ulint fork_type, /*!< in: fork type */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork); /*!< in: query fork */
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork); /*!< in: query fork */
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /*!< in: graph node */
+ que_node_t* parent);/*!< in: parent */
+/***********************************************************************//**
+Creates a query graph thread node.
+@return own: query thread node */
+UNIV_INTERN
+que_thr_t*
+que_thr_create(
+/*===========*/
+ que_fork_t* parent, /*!< in: parent node, i.e., a fork node */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations, they are freed in que_graph_free. */
+UNIV_INTERN
+void
+que_graph_free_recursive(
+/*=====================*/
+ que_node_t* node); /*!< in: query graph node */
+/**********************************************************************//**
+Frees a query graph. */
+UNIV_INTERN
+void
+que_graph_free(
+/*===========*/
+ que_t* graph); /*!< in: query graph; we assume that the memory
+ heap where this graph was created is private
+ to this graph: if not, then use
+ que_graph_free_recursive and free the heap
+ afterwards! */
+/**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx. The kernel mutex has
+to be reserved.
+@return TRUE if stopped */
+UNIV_INTERN
+ibool
+que_thr_stop(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction. */
+UNIV_INTERN
+void
+que_thr_move_to_run_state_for_mysql(
+/*================================*/
+ que_thr_t* thr, /*!< in: an query thread */
+ trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL
+select, when there is no error or lock wait. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql_no_error(
+/*============================*/
+ que_thr_t* thr, /*!< in: query thread */
+ trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
+query thread is stopped and made inactive, except in the case where
+it was put to the lock wait state in lock0lock.c, but the lock has already
+been granted or the transaction chosen as a victim in deadlock resolution. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Run a query thread. Handles lock waits. */
+UNIV_INTERN
+void
+que_run_threads(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+After signal handling is finished, returns control to a query graph error
+handling routine. (Currently, just returns the control to the root of the
+graph so that the graph can communicate an error message to the client.) */
+UNIV_INTERN
+void
+que_fork_error_handle(
+/*==================*/
+ trx_t* trx, /*!< in: trx */
+ que_t* fork); /*!< in: query graph which was run before signal
+ handling started, NULL not allowed */
+/**********************************************************************//**
+Moves a suspended query thread to the QUE_THR_RUNNING state and releases
+a single worker thread to execute it. This function should be used to end
+the wait state of a query thread waiting for a lock or a stored procedure
+completion. */
+UNIV_INTERN
+void
+que_thr_end_wait(
+/*=============*/
+ que_thr_t* thr, /*!< in: query thread in the
+ QUE_THR_LOCK_WAIT,
+ or QUE_THR_PROCEDURE_WAIT, or
+ QUE_THR_SIG_REPLY_WAIT state */
+ que_thr_t** next_thr); /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/**********************************************************************//**
+Same as que_thr_end_wait, but no parameter next_thr available. */
+UNIV_INTERN
+void
+que_thr_end_wait_no_next_thr(
+/*=========================*/
+ que_thr_t* thr); /*!< in: query thread in the
+ QUE_THR_LOCK_WAIT,
+ or QUE_THR_PROCEDURE_WAIT, or
+ QUE_THR_SIG_REPLY_WAIT state */
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+ que_fork_t* fork); /*!< in: a query fork */
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /*!< in: graph node */
+ ulint size); /*!< in: size */
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes. */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node); /*!< in: node in a list */
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ que_node_t* node); /*!< in: node */
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop.
+@return containing loop node, or NULL. */
+UNIV_INTERN
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+ que_node_t* node); /*!< in: node */
+/*********************************************************************//**
+Catenates a query graph node to a list of them, possible empty list.
+@return one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ que_node_t* node_list, /*!< in: node list, or NULL */
+ que_node_t* node); /*!< in: node */
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ que_node_t* node_list); /*!< in: node list, or NULL */
+/**********************************************************************//**
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped.
+@return TRUE if should be stopped; NOTE that if the peek is made
+without reserving the kernel mutex, then another peek with the mutex
+reserved is necessary before deciding the actual stopping */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************************//**
+Returns TRUE if the query graph is for a SELECT statement.
+@return TRUE if a select */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+ que_t* graph); /*!< in: graph */
+/**********************************************************************//**
+Prints info of an SQL query graph node. */
+UNIV_INTERN
+void
+que_node_print_info(
+/*================*/
+ que_node_t* node); /*!< in: query graph node */
+/*********************************************************************//**
+Evaluate the given SQL
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+que_eval_sql(
+/*=========*/
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql, /*!< in: SQL string */
+ ibool reserve_dict_mutex,
+ /*!< in: if TRUE, acquire/release
+ dict_sys->mutex around call to pars_sql. */
+ trx_t* trx); /*!< in: trx */
+
+/* Query graph query thread node: the fields are protected by the kernel
+mutex with the exceptions named below */
+
+struct que_thr_struct{
+ que_common_t common; /*!< type: QUE_NODE_THR */
+ ulint magic_n; /*!< magic number to catch memory
+ corruption */
+ que_node_t* child; /*!< graph child node */
+ que_t* graph; /*!< graph where this node belongs */
+ ibool is_active; /*!< TRUE if the thread has been set
+ to the run state in
+ que_thr_move_to_run_state, but not
+ deactivated in
+ que_thr_dec_reference_count */
+ ulint state; /*!< state of the query thread */
+ UT_LIST_NODE_T(que_thr_t)
+ thrs; /*!< list of thread nodes of the fork
+ node */
+ UT_LIST_NODE_T(que_thr_t)
+ trx_thrs; /*!< lists of threads in wait list of
+ the trx */
+ UT_LIST_NODE_T(que_thr_t)
+ queue; /*!< list of runnable thread nodes in
+ the server task queue */
+ /*------------------------------*/
+ /* The following fields are private to the OS thread executing the
+ query thread, and are not protected by the kernel mutex: */
+
+ que_node_t* run_node; /*!< pointer to the node where the
+ subgraph down from this node is
+ currently executed */
+ que_node_t* prev_node; /*!< pointer to the node from which
+ the control came */
+ ulint resource; /*!< resource usage of the query thread
+ thus far */
+ ulint lock_state; /*!< lock state of thread (table or
+ row) */
+};
+
+#define QUE_THR_MAGIC_N 8476583
+#define QUE_THR_MAGIC_FREED 123461526
+
+/* Query graph fork node: its fields are protected by the kernel mutex */
+struct que_fork_struct{
+ que_common_t common; /*!< type: QUE_NODE_FORK */
+ que_t* graph; /*!< query graph of this node */
+ ulint fork_type; /*!< fork type */
+ ulint n_active_thrs; /*!< if this is the root of a graph, the
+ number query threads that have been
+ started in que_thr_move_to_run_state
+ but for which que_thr_dec_refer_count
+ has not yet been called */
+ trx_t* trx; /*!< transaction: this is set only in
+ the root node */
+ ulint state; /*!< state of the fork node */
+ que_thr_t* caller; /*!< pointer to a possible calling query
+ thread */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ thrs; /*!< list of query threads */
+ /*------------------------------*/
+ /* The fields in this section are defined only in the root node */
+ sym_tab_t* sym_tab; /*!< symbol table of the query,
+ generated by the parser, or NULL
+ if the graph was created 'by hand' */
+ pars_info_t* info; /*!< info struct, or NULL */
+ /* The following cur_... fields are relevant only in a select graph */
+
+ ulint cur_end; /*!< QUE_CUR_NOT_DEFINED, QUE_CUR_START,
+ QUE_CUR_END */
+ ulint cur_pos; /*!< if there are n rows in the result
+ set, values 0 and n + 1 mean before
+ first row, or after last row, depending
+ on cur_end; values 1...n mean a row
+ index */
+ ibool cur_on_row; /*!< TRUE if cursor is on a row, i.e.,
+ it is not before the first row or
+ after the last row */
+ dulint n_inserts; /*!< number of rows inserted */
+ dulint n_updates; /*!< number of rows updated */
+ dulint n_deletes; /*!< number of rows deleted */
+ sel_node_t* last_sel_node; /*!< last executed select node, or NULL
+ if none */
+ UT_LIST_NODE_T(que_fork_t)
+ graphs; /*!< list of query graphs of a session
+ or a stored procedure */
+ /*------------------------------*/
+ mem_heap_t* heap; /*!< memory heap where the fork was
+ created */
+
+};
+
+/* Query fork (or graph) types */
+#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */
+#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */
+#define QUE_FORK_INSERT 3
+#define QUE_FORK_UPDATE 4
+#define QUE_FORK_ROLLBACK 5
+ /* This is really the undo graph used in rollback,
+ no signal-sending roll_node in this graph */
+#define QUE_FORK_PURGE 6
+#define QUE_FORK_EXECUTE 7
+#define QUE_FORK_PROCEDURE 8
+#define QUE_FORK_PROCEDURE_CALL 9
+#define QUE_FORK_MYSQL_INTERFACE 10
+#define QUE_FORK_RECOVERY 11
+
+/* Query fork (or graph) states */
+#define QUE_FORK_ACTIVE 1
+#define QUE_FORK_COMMAND_WAIT 2
+#define QUE_FORK_INVALID 3
+#define QUE_FORK_BEING_FREED 4
+
+/* Flag which is ORed to control structure statement node types */
+#define QUE_NODE_CONTROL_STAT 1024
+
+/* Query graph node types */
+#define QUE_NODE_LOCK 1
+#define QUE_NODE_INSERT 2
+#define QUE_NODE_UPDATE 4
+#define QUE_NODE_CURSOR 5
+#define QUE_NODE_SELECT 6
+#define QUE_NODE_AGGREGATE 7
+#define QUE_NODE_FORK 8
+#define QUE_NODE_THR 9
+#define QUE_NODE_UNDO 10
+#define QUE_NODE_COMMIT 11
+#define QUE_NODE_ROLLBACK 12
+#define QUE_NODE_PURGE 13
+#define QUE_NODE_CREATE_TABLE 14
+#define QUE_NODE_CREATE_INDEX 15
+#define QUE_NODE_SYMBOL 16
+#define QUE_NODE_RES_WORD 17
+#define QUE_NODE_FUNC 18
+#define QUE_NODE_ORDER 19
+#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_ASSIGNMENT 23
+#define QUE_NODE_FETCH 24
+#define QUE_NODE_OPEN 25
+#define QUE_NODE_COL_ASSIGNMENT 26
+#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_RETURN 28
+#define QUE_NODE_ROW_PRINTF 29
+#define QUE_NODE_ELSIF 30
+#define QUE_NODE_CALL 31
+#define QUE_NODE_EXIT 32
+
+/* Query thread states */
+#define QUE_THR_RUNNING 1
+#define QUE_THR_PROCEDURE_WAIT 2
+#define QUE_THR_COMPLETED 3 /* in selects this means that the
+ thread is at the end of its result set
+ (or start, in case of a scroll cursor);
+ in other statements, this means the
+ thread has done its task */
+#define QUE_THR_COMMAND_WAIT 4
+#define QUE_THR_LOCK_WAIT 5
+#define QUE_THR_SIG_REPLY_WAIT 6
+#define QUE_THR_SUSPENDED 7
+#define QUE_THR_ERROR 8
+
+/* Query thread lock states */
+#define QUE_THR_LOCK_NOLOCK 0
+#define QUE_THR_LOCK_ROW 1
+#define QUE_THR_LOCK_TABLE 2
+
+/* From where the cursor position is counted */
+#define QUE_CUR_NOT_DEFINED 1
+#define QUE_CUR_START 2
+#define QUE_CUR_END 3
+
+
+#ifndef UNIV_NONINL
+#include "que0que.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/que0que.ic b/storage/innobase/include/que0que.ic
new file mode 100644
index 00000000000..a1c0dc1e77a
--- /dev/null
+++ b/storage/innobase/include/que0que.ic
@@ -0,0 +1,273 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.ic
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "usr0sess.h"
+
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(thr);
+
+ return(thr->graph->trx);
+}
+
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork) /*!< in: query fork */
+{
+ return(UT_LIST_GET_FIRST(fork->thrs));
+}
+
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork) /*!< in: query fork */
+{
+ que_thr_t* thr;
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ return(thr->child);
+}
+
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(((que_common_t*)node)->type);
+}
+
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(&(((que_common_t*)node)->val));
+}
+
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(((que_common_t*)node)->val_buf_size);
+}
+
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /*!< in: graph node */
+ ulint size) /*!< in: size */
+{
+ ut_ad(node);
+
+ ((que_common_t*)node)->val_buf_size = size;
+}
+
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /*!< in: graph node */
+ que_node_t* parent) /*!< in: parent */
+{
+ ut_ad(node);
+
+ ((que_common_t*)node)->parent = parent;
+}
+
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(dfield_get_type(&((que_common_t*) node)->val));
+}
+
+/*********************************************************************//**
+Catenates a query graph node to a list of them, possible empty list.
+@return one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ que_node_t* node_list, /*!< in: node list, or NULL */
+ que_node_t* node) /*!< in: node */
+{
+ que_common_t* cnode;
+ que_common_t* cnode2;
+
+ cnode = (que_common_t*) node;
+
+ cnode->brother = NULL;
+
+ if (node_list == NULL) {
+
+ return(node);
+ }
+
+ cnode2 = (que_common_t*) node_list;
+
+ while (cnode2->brother != NULL) {
+ cnode2 = (que_common_t*) cnode2->brother;
+ }
+
+ cnode2->brother = node;
+
+ return(node_list);
+}
+
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes.
+@return next node in a list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node) /*!< in: node in a list */
+{
+ return(((que_common_t*)node)->brother);
+}
+
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ que_node_t* node_list) /*!< in: node list, or NULL */
+{
+ const que_common_t* cnode;
+ ulint len;
+
+ cnode = (const que_common_t*) node_list;
+ len = 0;
+
+ while (cnode != NULL) {
+ len++;
+ cnode = (const que_common_t*) cnode->brother;
+ }
+
+ return(len);
+}
+
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ que_node_t* node) /*!< in: node */
+{
+ return(((que_common_t*)node)->parent);
+}
+
+/**********************************************************************//**
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped.
+@return TRUE if should be stopped; NOTE that if the peek is made
+without reserving the kernel mutex, then another peek with the mutex
+reserved is necessary before deciding the actual stopping */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ que_t* graph;
+
+ graph = thr->graph;
+ trx = graph->trx;
+
+ if (graph->state != QUE_FORK_ACTIVE
+ || trx->que_state == TRX_QUE_LOCK_WAIT
+ || (UT_LIST_GET_LEN(trx->signals) > 0
+ && trx->que_state == TRX_QUE_RUNNING)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************************//**
+Returns TRUE if the query graph is for a SELECT statement.
+@return TRUE if a select */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+ que_t* graph) /*!< in: graph */
+{
+ if (graph->fork_type == QUE_FORK_SELECT_SCROLL
+ || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h
new file mode 100644
index 00000000000..ea976074768
--- /dev/null
+++ b/storage/innobase/include/que0types.h
@@ -0,0 +1,60 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0types.h
+Query graph global types
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0types_h
+#define que0types_h
+
+#include "data0data.h"
+#include "dict0types.h"
+
+/* Pseudotype for all graph nodes */
+typedef void que_node_t;
+
+typedef struct que_fork_struct que_fork_t;
+
+/* Query graph root is a fork node */
+typedef que_fork_t que_t;
+
+typedef struct que_thr_struct que_thr_t;
+typedef struct que_common_struct que_common_t;
+
+/* Common struct at the beginning of each query graph node; the name of this
+substruct must be 'common' */
+
+struct que_common_struct{
+ ulint type; /*!< query node type */
+ que_node_t* parent; /*!< back pointer to parent node, or NULL */
+ que_node_t* brother;/* pointer to a possible brother node */
+ dfield_t val; /*!< evaluated value for an expression */
+ ulint val_buf_size;
+ /* buffer size for the evaluated value data,
+ if the buffer has been allocated dynamically:
+ if this field is != 0, and the node is a
+ symbol node or a function node, then we
+ have to free the data field in val
+ explicitly */
+};
+
+#endif
diff --git a/storage/innobase/include/read0read.h b/storage/innobase/include/read0read.h
new file mode 100644
index 00000000000..4d9a9fade36
--- /dev/null
+++ b/storage/innobase/include/read0read.h
@@ -0,0 +1,194 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0read.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0read_h
+#define read0read_h
+
+#include "univ.i"
+
+
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "trx0trx.h"
+#include "read0types.h"
+
+/*********************************************************************//**
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view.
+@return own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_open_now(
+/*===============*/
+ trx_id_t cr_trx_id, /*!< in: trx_id of creating
+ transaction, or ut_dulint_zero
+ used in purge */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ allocated */
+/*********************************************************************//**
+Makes a copy of the oldest existing read view, or opens a new. The view
+must be closed with ..._close.
+@return own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_oldest_copy_or_open_new(
+/*==============================*/
+ trx_id_t cr_trx_id, /*!< in: trx_id of creating
+ transaction, or ut_dulint_zero
+ used in purge */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ allocated */
+/*********************************************************************//**
+Closes a read view. */
+UNIV_INTERN
+void
+read_view_close(
+/*============*/
+ read_view_t* view); /*!< in: read view */
+/*********************************************************************//**
+Closes a consistent read view for MySQL. This function is called at an SQL
+statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
+UNIV_INTERN
+void
+read_view_close_for_mysql(
+/*======================*/
+ trx_t* trx); /*!< in: trx which has a read view */
+/*********************************************************************//**
+Checks if a read view sees the specified transaction.
+@return TRUE if sees */
+UNIV_INLINE
+ibool
+read_view_sees_trx_id(
+/*==================*/
+ const read_view_t* view, /*!< in: read view */
+ trx_id_t trx_id);/*!< in: trx id */
+/*********************************************************************//**
+Prints a read view to stderr. */
+UNIV_INTERN
+void
+read_view_print(
+/*============*/
+ const read_view_t* view); /*!< in: read view */
+/*********************************************************************//**
+Create a consistent cursor view for mysql to be used in cursors. In this
+consistent read view modifications done by the creating transaction or future
+transactions are not visible. */
+UNIV_INTERN
+cursor_view_t*
+read_cursor_view_create_for_mysql(
+/*==============================*/
+ trx_t* cr_trx);/*!< in: trx where cursor view is created */
+/*********************************************************************//**
+Close a given consistent cursor view for mysql and restore global read view
+back to a transaction read view. */
+UNIV_INTERN
+void
+read_cursor_view_close_for_mysql(
+/*=============================*/
+ trx_t* trx, /*!< in: trx */
+ cursor_view_t* curview); /*!< in: cursor view to be closed */
+/*********************************************************************//**
+This function sets a given consistent cursor view to a transaction
+read view if given consistent cursor view is not NULL. Otherwise, function
+restores a global read view to a transaction read view. */
+UNIV_INTERN
+void
+read_cursor_set_for_mysql(
+/*======================*/
+ trx_t* trx, /*!< in: transaction where cursor is set */
+ cursor_view_t* curview);/*!< in: consistent cursor view to be set */
+
+/** Read view lists the trx ids of those transactions for which a consistent
+read should not see the modifications to the database. */
+
+struct read_view_struct{
+ ulint type; /*!< VIEW_NORMAL, VIEW_HIGH_GRANULARITY */
+ undo_no_t undo_no;/*!< ut_dulint_zero or if type is
+ VIEW_HIGH_GRANULARITY
+ transaction undo_no when this high-granularity
+ consistent read view was created */
+ trx_id_t low_limit_no;
+ /*!< The view does not need to see the undo
+ logs for transactions whose transaction number
+ is strictly smaller (<) than this value: they
+ can be removed in purge if not needed by other
+ views */
+ trx_id_t low_limit_id;
+ /*!< The read should not see any transaction
+ with trx id >= this value. In other words,
+ this is the "high water mark". */
+ trx_id_t up_limit_id;
+ /*!< The read should see all trx ids which
+ are strictly smaller (<) than this value.
+ In other words,
+ this is the "low water mark". */
+ ulint n_trx_ids;
+ /*!< Number of cells in the trx_ids array */
+ trx_id_t* trx_ids;/*!< Additional trx ids which the read should
+ not see: typically, these are the active
+ transactions at the time when the read is
+ serialized, except the reading transaction
+ itself; the trx ids in this array are in a
+ descending order. These trx_ids should be
+ between the "low" and "high" water marks,
+ that is, up_limit_id and low_limit_id. */
+ trx_id_t creator_trx_id;
+ /*!< trx id of creating transaction, or
+ ut_dulint_zero used in purge */
+ UT_LIST_NODE_T(read_view_t) view_list;
+ /*!< List of read views in trx_sys */
+};
+
+/** Read view types @{ */
+#define VIEW_NORMAL 1 /*!< Normal consistent read view
+ where transaction does not see changes
+ made by active transactions except
+ creating transaction. */
+#define VIEW_HIGH_GRANULARITY 2 /*!< High-granularity read view where
+ transaction does not see changes
+ made by active transactions and own
+ changes after a point in time when this
+ read view was created. */
+/* @} */
+
+/** Implement InnoDB framework to support consistent read views in
+cursors. This struct holds both heap where consistent read view
+is allocated and pointer to a read view. */
+
+struct cursor_view_struct{
+ mem_heap_t* heap;
+ /*!< Memory heap for the cursor view */
+ read_view_t* read_view;
+ /*!< Consistent read view of the cursor*/
+ ulint n_mysql_tables_in_use;
+ /*!< number of Innobase tables used in the
+ processing of this cursor */
+};
+
+#ifndef UNIV_NONINL
+#include "read0read.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/read0read.ic b/storage/innobase/include/read0read.ic
new file mode 100644
index 00000000000..9924967cc2d
--- /dev/null
+++ b/storage/innobase/include/read0read.ic
@@ -0,0 +1,98 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0read.ic
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+/*********************************************************************//**
+Gets the nth trx id in a read view.
+@return trx id */
+UNIV_INLINE
+trx_id_t
+read_view_get_nth_trx_id(
+/*=====================*/
+ const read_view_t* view, /*!< in: read view */
+ ulint n) /*!< in: position */
+{
+ ut_ad(n < view->n_trx_ids);
+
+ return(*(view->trx_ids + n));
+}
+
+/*********************************************************************//**
+Sets the nth trx id in a read view. */
+UNIV_INLINE
+void
+read_view_set_nth_trx_id(
+/*=====================*/
+ read_view_t* view, /*!< in: read view */
+ ulint n, /*!< in: position */
+ trx_id_t trx_id) /*!< in: trx id to set */
+{
+ ut_ad(n < view->n_trx_ids);
+
+ *(view->trx_ids + n) = trx_id;
+}
+
+/*********************************************************************//**
+Checks if a read view sees the specified transaction.
+@return TRUE if sees */
+UNIV_INLINE
+ibool
+read_view_sees_trx_id(
+/*==================*/
+ const read_view_t* view, /*!< in: read view */
+ trx_id_t trx_id) /*!< in: trx id */
+{
+ ulint n_ids;
+ int cmp;
+ ulint i;
+
+ if (ut_dulint_cmp(trx_id, view->up_limit_id) < 0) {
+
+ return(TRUE);
+ }
+
+ if (ut_dulint_cmp(trx_id, view->low_limit_id) >= 0) {
+
+ return(FALSE);
+ }
+
+ /* We go through the trx ids in the array smallest first: this order
+ may save CPU time, because if there was a very long running
+ transaction in the trx id array, its trx id is looked at first, and
+ the first two comparisons may well decide the visibility of trx_id. */
+
+ n_ids = view->n_trx_ids;
+
+ for (i = 0; i < n_ids; i++) {
+
+ cmp = ut_dulint_cmp(
+ trx_id,
+ read_view_get_nth_trx_id(view, n_ids - i - 1));
+ if (cmp <= 0) {
+ return(cmp < 0);
+ }
+ }
+
+ return(TRUE);
+}
diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h
new file mode 100644
index 00000000000..caf69e3fb51
--- /dev/null
+++ b/storage/innobase/include/read0types.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0types.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0types_h
+#define read0types_h
+
+typedef struct read_view_struct read_view_t;
+typedef struct cursor_view_struct cursor_view_t;
+
+#endif
diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h
new file mode 100644
index 00000000000..d30d9f86abe
--- /dev/null
+++ b/storage/innobase/include/rem0cmp.h
@@ -0,0 +1,194 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.h
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef rem0cmp_h
+#define rem0cmp_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+UNIV_INTERN
+ibool
+cmp_cols_are_equal(
+/*===============*/
+ const dict_col_t* col1, /*!< in: column 1 */
+ const dict_col_t* col2, /*!< in: column 2 */
+ ibool check_charsets);
+ /*!< in: whether to check charsets */
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INLINE
+int
+cmp_data_data(
+/*==========*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2); /*!< in: data field length or UNIV_SQL_NULL */
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2); /*!< in: data field length or UNIV_SQL_NULL */
+/*************************************************************//**
+This function is used to compare two dfields where at least the first
+has its data type field set.
+@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2,
+respectively */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+ const dfield_t* dfield1,/*!< in: data field; must have type field set */
+ const dfield_t* dfield2);/*!< in: data field */
+/*************************************************************//**
+This function is used to compare a data tuple to a physical record.
+Only dtuple->n_fields_cmp first fields are taken into account for
+the the data tuple! If we denote by n = n_fields_cmp, then rec must
+have either m >= n fields, or it must differ from dtuple in some of
+the m fields rec has. If rec has an externally stored field we do not
+compare it but return with value 0 if such a comparison should be
+made.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared, or until
+the first externally stored field in rec */
+UNIV_INTERN
+int
+cmp_dtuple_rec_with_match(
+/*======================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when function returns,
+ contains the value for current comparison */
+ ulint* matched_bytes); /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns, contains the
+ value for current comparison */
+/**************************************************************//**
+Compares a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@return 1, 0, -1, if dtuple is greater, equal, less than rec, respectively */
+UNIV_INTERN
+int
+cmp_dtuple_rec(
+/*===========*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**************************************************************//**
+Checks if a dtuple is a prefix of a record. The last field in dtuple
+is allowed to be a prefix of the corresponding field in the record.
+@return TRUE if prefix */
+UNIV_INTERN
+ibool
+cmp_dtuple_is_prefix_of_rec(
+/*========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/*************************************************************//**
+Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than rec2 */
+UNIV_INTERN
+int
+cmp_rec_rec_simple(
+/*===============*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+ const dict_index_t* index); /*!< in: data dictionary index */
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared, and if an externally stored field is
+encountered, then 0 is returned.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively */
+UNIV_INTERN
+int
+cmp_rec_rec_with_match(
+/*===================*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index, /*!< in: data dictionary index */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when the function returns,
+ contains the value the for current
+ comparison */
+ ulint* matched_bytes);/*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when the function returns, contains
+ the value for the current comparison */
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared.
+@return 1, 0 , -1 if rec1 is greater, equal, less, respectively, than
+rec2; only the common first fields are compared */
+UNIV_INLINE
+int
+cmp_rec_rec(
+/*========*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index); /*!< in: data dictionary index */
+
+
+#ifndef UNIV_NONINL
+#include "rem0cmp.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/rem0cmp.ic b/storage/innobase/include/rem0cmp.ic
new file mode 100644
index 00000000000..39ef5f4fba3
--- /dev/null
+++ b/storage/innobase/include/rem0cmp.ic
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.ic
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INLINE
+int
+cmp_data_data(
+/*==========*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2) /*!< in: data field length or UNIV_SQL_NULL */
+{
+ return(cmp_data_data_slow(mtype, prtype, data1, len1, data2, len2));
+}
+
+/*************************************************************//**
+This function is used to compare two dfields where at least the first
+has its data type field set.
+@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2,
+respectively */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+ const dfield_t* dfield1,/*!< in: data field; must have type field set */
+ const dfield_t* dfield2)/*!< in: data field */
+{
+ const dtype_t* type;
+
+ ut_ad(dfield_check_typed(dfield1));
+
+ type = dfield_get_type(dfield1);
+
+ return(cmp_data_data(type->mtype, type->prtype,
+ (const byte*) dfield_get_data(dfield1),
+ dfield_get_len(dfield1),
+ (const byte*) dfield_get_data(dfield2),
+ dfield_get_len(dfield2)));
+}
+
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared.
+@return 1, 0 , -1 if rec1 is greater, equal, less, respectively, than
+rec2; only the common first fields are compared */
+UNIV_INLINE
+int
+cmp_rec_rec(
+/*========*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index) /*!< in: data dictionary index */
+{
+ ulint match_f = 0;
+ ulint match_b = 0;
+
+ return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index,
+ &match_f, &match_b));
+}
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
new file mode 100644
index 00000000000..17d08afabb9
--- /dev/null
+++ b/storage/innobase/include/rem0rec.h
@@ -0,0 +1,824 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.h
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0rec_h
+#define rem0rec_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "rem0types.h"
+#include "mtr0types.h"
+#include "page0types.h"
+
+/* Info bit denoting the predefined minimum record: this bit is set
+if and only if the record is the first user record on a non-leaf
+B-tree page that is the leftmost page on its level
+(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */
+#define REC_INFO_MIN_REC_FLAG 0x10UL
+/* The deleted flag in info bits */
+#define REC_INFO_DELETED_FLAG 0x20UL /* when bit is set to 1, it means the
+ record has been delete marked */
+
+/* Number of extra bytes in an old-style record,
+in addition to the data and the offsets */
+#define REC_N_OLD_EXTRA_BYTES 6
+/* Number of extra bytes in a new-style record,
+in addition to the data and the offsets */
+#define REC_N_NEW_EXTRA_BYTES 5
+
+/* Record status values */
+#define REC_STATUS_ORDINARY 0
+#define REC_STATUS_NODE_PTR 1
+#define REC_STATUS_INFIMUM 2
+#define REC_STATUS_SUPREMUM 3
+
+/* The following four constants are needed in page0zip.c in order to
+efficiently compress and decompress pages. */
+
+/* The offset of heap_no in a compact record */
+#define REC_NEW_HEAP_NO 4
+/* The shift of heap_no in a compact record.
+The status is stored in the low-order bits. */
+#define REC_HEAP_NO_SHIFT 3
+
+/* Length of a B-tree node pointer, in bytes */
+#define REC_NODE_PTR_SIZE 4
+
+#ifdef UNIV_DEBUG
+/* Length of the rec_get_offsets() header */
+# define REC_OFFS_HEADER_SIZE 4
+#else /* UNIV_DEBUG */
+/* Length of the rec_get_offsets() header */
+# define REC_OFFS_HEADER_SIZE 2
+#endif /* UNIV_DEBUG */
+
+/* Number of elements that should be initially allocated for the
+offsets[] array, first passed to rec_get_offsets() */
+#define REC_OFFS_NORMAL_SIZE 100
+#define REC_OFFS_SMALL_SIZE 10
+
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+ rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to get the offset of the
+next chained record on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint next); /*!< in: offset of the next record */
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint next); /*!< in: offset of the next record */
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+ const rec_t* rec); /*!< in: physical record */
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index); /*!< in: record descriptor */
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+ const rec_t* rec); /*!< in: old-style physical record */
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_old(
+/*================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint n_owned); /*!< in: the number of owned */
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+ const rec_t* rec); /*!< in: new-style physical record */
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_new(
+/*================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n_owned);/*!< in: the number of owned */
+/******************************************************//**
+The following function is used to retrieve the info bits of
+a record.
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_bits(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint bits); /*!< in: info bits */
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint bits); /*!< in: info bits */
+/******************************************************//**
+The following function retrieves the status bits of a new-style record.
+@return status bits */
+UNIV_INLINE
+ulint
+rec_get_status(
+/*===========*/
+ const rec_t* rec); /*!< in: physical record */
+
+/******************************************************//**
+The following function is used to set the status bits of a new-style record. */
+UNIV_INLINE
+void
+rec_set_status(
+/*===========*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits); /*!< in: info bits */
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record. (Only compact records have status bits.)
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_and_status_bits(
+/*=========================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record. (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+ rec_t* rec, /*!< in/out: compact physical record */
+ ulint bits); /*!< in: info bits */
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_old(
+/*=====================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint flag); /*!< in: nonzero if delete marked */
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_new(
+/*=====================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint flag); /*!< in: nonzero if delete marked */
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+ibool
+rec_get_node_ptr_flag(
+/*==================*/
+ const rec_t* rec); /*!< in: physical record */
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+ const rec_t* rec); /*!< in: physical record */
+/******************************************************//**
+The following function is used to set the heap number
+field in an old-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_old(
+/*================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint heap_no);/*!< in: the heap number */
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+ const rec_t* rec); /*!< in: physical record */
+/******************************************************//**
+The following function is used to set the heap number
+field in a new-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_new(
+/*================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint heap_no);/*!< in: the heap number */
+/******************************************************//**
+The following function is used to test whether the data offsets
+in the record are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ const rec_t* rec); /*!< in: physical record */
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return number of externally stored columns */
+UNIV_INTERN
+ulint
+rec_get_n_extern_new(
+/*=================*/
+ const rec_t* rec, /*!< in: compact physical record */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n); /*!< in: number of columns to scan */
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array.
+@return the new offsets */
+UNIV_INTERN
+ulint*
+rec_get_offsets_func(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: array consisting of
+ offsets[0] allocated elements,
+ or an array from rec_get_offsets(),
+ or NULL */
+ ulint n_fields,/*!< in: maximum number of
+ initialized fields
+ (ULINT_UNDEFINED if all fields) */
+ mem_heap_t** heap, /*!< in/out: memory heap */
+ const char* file, /*!< in: file name where called */
+ ulint line); /*!< in: line number where called */
+
+#define rec_get_offsets(rec,index,offsets,n,heap) \
+ rec_get_offsets_func(rec,index,offsets,n,heap,__FILE__,__LINE__)
+
+/******************************************************//**
+Determine the offset to each field in a leaf-page record
+in ROW_FORMAT=COMPACT. This is a special case of
+rec_init_offsets() and rec_get_offsets_func(). */
+UNIV_INTERN
+void
+rec_init_offsets_comp_ordinary(
+/*===========================*/
+ const rec_t* rec, /*!< in: physical record in
+ ROW_FORMAT=COMPACT */
+ ulint extra, /*!< in: number of bytes to reserve
+ between the record header and
+ the data payload
+ (usually REC_N_NEW_EXTRA_BYTES) */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets);/*!< in/out: array of offsets;
+ in: n=rec_offs_n_fields(offsets) */
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array. */
+UNIV_INTERN
+void
+rec_get_offsets_reverse(
+/*====================*/
+ const byte* extra, /*!< in: the extra bytes of a
+ compact record in reverse order,
+ excluding the fixed-size
+ REC_N_NEW_EXTRA_BYTES */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint node_ptr,/*!< in: nonzero=node pointer,
+ 0=leaf node */
+ ulint* offsets);/*!< in/out: array consisting of
+ offsets[0] allocated elements */
+
+/************************************************************//**
+Validates offsets returned by rec_get_offsets().
+@return TRUE if valid */
+UNIV_INLINE
+ibool
+rec_offs_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: record or NULL */
+ const dict_index_t* index, /*!< in: record descriptor or NULL */
+ const ulint* offsets);/*!< in: array returned by
+ rec_get_offsets() */
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Updates debug data in offsets, in order to avoid bogus
+rec_offs_validate() failures. */
+UNIV_INLINE
+void
+rec_offs_make_valid(
+/*================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets);/*!< in: array returned by
+ rec_get_offsets() */
+#else
+# define rec_offs_make_valid(rec, index, offsets) ((void) 0)
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return offset to the field */
+UNIV_INTERN
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: index of the field */
+ ulint* len); /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+#define rec_get_nth_field_old(rec, n, len) \
+((rec) + rec_get_nth_field_offs_old(rec, n, len))
+/************************************************************//**
+Gets the physical size of an old-style field.
+Also an SQL null may have a field of size > 0,
+if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n); /*!< in: index of the field */
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+ulint
+rec_get_nth_field_offs(
+/*===================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index of the field */
+ ulint* len); /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+#define rec_get_nth_field(rec, offsets, n, len) \
+((rec) + rec_get_nth_field_offs(offsets, n, len))
+/******************************************************//**
+Determine if the offsets are for a record in the new
+compact format.
+@return nonzero if compact format */
+UNIV_INLINE
+ulint
+rec_offs_comp(
+/*==========*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/******************************************************//**
+Determine if the offsets are for a record containing
+externally stored columns.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_any_extern(
+/*================*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/******************************************************//**
+Returns nonzero if the extern bit is set in nth field of rec.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_nth_extern(
+/*================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n); /*!< in: nth field */
+/******************************************************//**
+Returns nonzero if the SQL NULL bit is set in nth field of rec.
+@return nonzero if SQL NULL */
+UNIV_INLINE
+ulint
+rec_offs_nth_sql_null(
+/*==================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n); /*!< in: nth field */
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n); /*!< in: nth field */
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***********************************************************//**
+This is used to modify the value of an already existing field in a record.
+The previous value must have exactly the same size as the new value. If len
+is UNIV_SQL_NULL then the field is treated as an SQL null.
+For records in ROW_FORMAT=COMPACT (new-style records), len must not be
+UNIV_SQL_NULL unless the field already is SQL null. */
+UNIV_INLINE
+void
+rec_set_nth_field(
+/*==============*/
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index number of the field */
+ const void* data, /*!< in: pointer to the data if not SQL null */
+ ulint len); /*!< in: length of the data or UNIV_SQL_NULL */
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+ const rec_t* rec); /*!< in: physical record */
+/**********************************************************//**
+The following function returns the number of allocated elements
+for an array of offsets.
+@return number of elements */
+UNIV_INLINE
+ulint
+rec_offs_get_n_alloc(
+/*=================*/
+ const ulint* offsets);/*!< in: array for rec_get_offsets() */
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+ ulint* offsets, /*!< out: array for rec_get_offsets(),
+ must be allocated */
+ ulint n_alloc); /*!< in: number of elements */
+#define rec_offs_init(offsets) \
+ rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets)
+/**********************************************************//**
+The following function returns the number of fields in a record.
+@return number of fields */
+UNIV_INLINE
+ulint
+rec_offs_n_fields(
+/*==============*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns the total size of record minus data size of record.
+The value returned by the function is the distance from record
+start to record origin in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Copies a physical record to a buffer.
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+/*=====*/
+ void* buf, /*!< in: buffer */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+UNIV_INTERN
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ byte** buf, /*!< in/out: memory buffer
+ for the copied prefix,
+ or NULL */
+ ulint* buf_size); /*!< in/out: buffer size */
+/************************************************************//**
+Folds a prefix of a physical record to a ulint.
+@return the folded value */
+UNIV_INLINE
+ulint
+rec_fold(
+/*=====*/
+ const rec_t* rec, /*!< in: the physical record */
+ const ulint* offsets, /*!< in: array returned by
+ rec_get_offsets() */
+ ulint n_fields, /*!< in: number of complete
+ fields to fold */
+ ulint n_bytes, /*!< in: number of bytes to fold
+ in an incomplete last field */
+ dulint tree_id) /*!< in: index tree id */
+ __attribute__((pure));
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************//**
+Builds a ROW_FORMAT=COMPACT record out of a data tuple. */
+UNIV_INTERN
+void
+rec_convert_dtuple_to_rec_comp(
+/*===========================*/
+ rec_t* rec, /*!< in: origin of record */
+ ulint extra, /*!< in: number of bytes to
+ reserve between the record
+ header and the data payload
+ (normally REC_N_NEW_EXTRA_BYTES) */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint status, /*!< in: status bits of the record */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields);/*!< in: number of data fields */
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it into the given buffer.
+@return pointer to the origin of physical record */
+UNIV_INTERN
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+ byte* buf, /*!< in: start address of the
+ physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext); /*!< in: number of
+ externally stored columns */
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ ulint data_size, /*!< in: data size */
+ ulint n_fields, /*!< in: number of fields */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ __attribute__((const));
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+ const dict_index_t* index, /*!< in: record descriptor;
+ dict_table_is_comp() is
+ assumed to hold, even if
+ it does not */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra); /*!< out: extra size */
+/**********************************************************//**
+Determines the size of a data tuple in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp(
+/*========================*/
+ const dict_index_t* index, /*!< in: record descriptor;
+ dict_table_is_comp() is
+ assumed to hold, even if
+ it does not */
+ ulint status, /*!< in: status bits of the record */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra); /*!< out: extra size */
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext); /*!< in: number of externally stored columns */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Copies the first n fields of a physical record to a data tuple.
+The fields are copied to the memory heap. */
+UNIV_INTERN
+void
+rec_copy_prefix_to_dtuple(
+/*======================*/
+ dtuple_t* tuple, /*!< out: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ mem_heap_t* heap); /*!< in: memory heap */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Validates the consistency of a physical record.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+rec_validate(
+/*=========*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Prints an old-style physical record. */
+UNIV_INTERN
+void
+rec_print_old(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec); /*!< in: physical record */
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Prints a physical record in ROW_FORMAT=COMPACT. Ignores the
+record header. */
+UNIV_INTERN
+void
+rec_print_comp(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print_new(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print(
+/*======*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ dict_index_t* index); /*!< in: record descriptor */
+#endif /* UNIV_HOTBACKUP */
+
+#define REC_INFO_BITS 6 /* This is single byte bit-field */
+
+/* Maximum lengths for the data in a physical record if the offsets
+are given in one byte (resp. two byte) format. */
+#define REC_1BYTE_OFFS_LIMIT 0x7FUL
+#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL
+
+/* The data size of record must be smaller than this because we reserve
+two upmost bits in a two byte offset for special purposes */
+#define REC_MAX_DATA_SIZE (16 * 1024)
+
+#ifndef UNIV_NONINL
+#include "rem0rec.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic
new file mode 100644
index 00000000000..9fe736f9b0b
--- /dev/null
+++ b/storage/innobase/include/rem0rec.ic
@@ -0,0 +1,1647 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.ic
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mach0data.h"
+#include "ut0byte.h"
+#include "dict0dict.h"
+
+/* Compact flag ORed to the extra size returned by rec_get_offsets() */
+#define REC_OFFS_COMPACT ((ulint) 1 << 31)
+/* SQL NULL flag in offsets returned by rec_get_offsets() */
+#define REC_OFFS_SQL_NULL ((ulint) 1 << 31)
+/* External flag in offsets returned by rec_get_offsets() */
+#define REC_OFFS_EXTERNAL ((ulint) 1 << 30)
+/* Mask for offsets returned by rec_get_offsets() */
+#define REC_OFFS_MASK (REC_OFFS_EXTERNAL - 1)
+
+/* Offsets of the bit-fields in an old-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits pointer to next record
+ 2 8 bits pointer to next record
+ 3 1 bit short flag
+ 7 bits number of fields
+ 4 3 bits number of fields
+ 5 bits heap number
+ 5 8 bits heap number
+ 6 4 bits n_owned
+ 4 bits info bits
+*/
+
+/* Offsets of the bit-fields in a new-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits relative offset of next record
+ 2 8 bits relative offset of next record
+ the relative offset is an unsigned 16-bit
+ integer:
+ (offset_of_next_record
+ - offset_of_this_record) mod 64Ki,
+ where mod is the modulo as a non-negative
+ number;
+ we can calculate the the offset of the next
+ record with the formula:
+ relative_offset + offset_of_this_record
+ mod UNIV_PAGE_SIZE
+ 3 3 bits status:
+ 000=conventional record
+ 001=node pointer record (inside B-tree)
+ 010=infimum record
+ 011=supremum record
+ 1xx=reserved
+ 5 bits heap number
+ 4 8 bits heap number
+ 5 4 bits n_owned
+ 4 bits info bits
+*/
+
+/* We list the byte offsets from the origin of the record, the mask,
+and the shift needed to obtain each bit-field of the record. */
+
+#define REC_NEXT 2
+#define REC_NEXT_MASK 0xFFFFUL
+#define REC_NEXT_SHIFT 0
+
+#define REC_OLD_SHORT 3 /* This is single byte bit-field */
+#define REC_OLD_SHORT_MASK 0x1UL
+#define REC_OLD_SHORT_SHIFT 0
+
+#define REC_OLD_N_FIELDS 4
+#define REC_OLD_N_FIELDS_MASK 0x7FEUL
+#define REC_OLD_N_FIELDS_SHIFT 1
+
+#define REC_NEW_STATUS 3 /* This is single byte bit-field */
+#define REC_NEW_STATUS_MASK 0x7UL
+#define REC_NEW_STATUS_SHIFT 0
+
+#define REC_OLD_HEAP_NO 5
+#define REC_HEAP_NO_MASK 0xFFF8UL
+#if 0 /* defined in rem0rec.h for use of page0zip.c */
+#define REC_NEW_HEAP_NO 4
+#define REC_HEAP_NO_SHIFT 3
+#endif
+
+#define REC_OLD_N_OWNED 6 /* This is single byte bit-field */
+#define REC_NEW_N_OWNED 5 /* This is single byte bit-field */
+#define REC_N_OWNED_MASK 0xFUL
+#define REC_N_OWNED_SHIFT 0
+
+#define REC_OLD_INFO_BITS 6 /* This is single byte bit-field */
+#define REC_NEW_INFO_BITS 5 /* This is single byte bit-field */
+#define REC_INFO_BITS_MASK 0xF0UL
+#define REC_INFO_BITS_SHIFT 0
+
+/* The following masks are used to filter the SQL null bit from
+one-byte and two-byte offsets */
+
+#define REC_1BYTE_SQL_NULL_MASK 0x80UL
+#define REC_2BYTE_SQL_NULL_MASK 0x8000UL
+
+/* In a 2-byte offset the second most significant bit denotes
+a field stored to another page: */
+
+#define REC_2BYTE_EXTERN_MASK 0x4000UL
+
+#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \
+ ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \
+ ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \
+ ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \
+ ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \
+ ^ 0xFFFFFFFFUL
+# error "sum of old-style masks != 0xFFFFFFFFUL"
+#endif
+#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \
+ ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \
+ ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \
+ ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \
+ ^ 0xFFFFFFUL
+# error "sum of new-style masks != 0xFFFFFFUL"
+#endif
+
+/***********************************************************//**
+Sets the value of the ith field SQL null bit of an old-style record. */
+UNIV_INTERN
+void
+rec_set_nth_field_null_bit(
+/*=======================*/
+ rec_t* rec, /*!< in: record */
+ ulint i, /*!< in: ith field */
+ ibool val); /*!< in: value to set */
+/***********************************************************//**
+Sets an old-style record field to SQL null.
+The physical size of the field is not changed. */
+UNIV_INTERN
+void
+rec_set_nth_field_sql_null(
+/*=======================*/
+ rec_t* rec, /*!< in: record */
+ ulint n); /*!< in: index of the field */
+
+/******************************************************//**
+Gets a bit field from within 1 byte. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_1(
+/*================*/
+ const rec_t* rec, /*!< in: pointer to record origin */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+
+ return((mach_read_from_1(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 1 byte. */
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+ rec_t* rec, /*!< in: pointer to record origin */
+ ulint val, /*!< in: value to set */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+ ut_ad(mask);
+ ut_ad(mask <= 0xFFUL);
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_1(rec - offs,
+ (mach_read_from_1(rec - offs) & ~mask)
+ | (val << shift));
+}
+
+/******************************************************//**
+Gets a bit field from within 2 bytes. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+ const rec_t* rec, /*!< in: pointer to record origin */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+
+ return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 2 bytes. */
+UNIV_INLINE
+void
+rec_set_bit_field_2(
+/*================*/
+ rec_t* rec, /*!< in: pointer to record origin */
+ ulint val, /*!< in: value to set */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+ ut_ad(mask > 0xFFUL);
+ ut_ad(mask <= 0xFFFFUL);
+ ut_ad((mask >> shift) & 1);
+ ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1)));
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_2(rec - offs,
+ (mach_read_from_2(rec - offs) & ~mask)
+ | (val << shift));
+}
+
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint field_value;
+
+ ut_ad(REC_NEXT_MASK == 0xFFFFUL);
+ ut_ad(REC_NEXT_SHIFT == 0);
+
+ field_value = mach_read_from_2(rec - REC_NEXT);
+
+ if (UNIV_UNLIKELY(field_value == 0)) {
+
+ return(NULL);
+ }
+
+ if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) {
+#if UNIV_PAGE_SIZE <= 32768
+ /* Note that for 64 KiB pages, field_value can 'wrap around'
+ and the debug assertion is not valid */
+
+ /* In the following assertion, field_value is interpreted
+ as signed 16-bit integer in 2's complement arithmetics.
+ If all platforms defined int16_t in the standard headers,
+ the expression could be written simpler as
+ (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE
+ */
+ ut_ad((field_value >= 32768
+ ? field_value - 65536
+ : field_value)
+ + ut_align_offset(rec, UNIV_PAGE_SIZE)
+ < UNIV_PAGE_SIZE);
+#endif
+ /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+ between each record. */
+ ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+ && field_value < 32768)
+ || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+ return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE)
+ + ut_align_offset(rec + field_value, UNIV_PAGE_SIZE));
+ } else {
+ ut_ad(field_value < UNIV_PAGE_SIZE);
+
+ return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE)
+ + field_value);
+ }
+}
+
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+ rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ return((rec_t*) rec_get_next_ptr_const(rec, comp));
+}
+
+/******************************************************//**
+The following function is used to get the offset of the next chained record
+on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint field_value;
+#if REC_NEXT_MASK != 0xFFFFUL
+# error "REC_NEXT_MASK != 0xFFFFUL"
+#endif
+#if REC_NEXT_SHIFT
+# error "REC_NEXT_SHIFT != 0"
+#endif
+
+ field_value = mach_read_from_2(rec - REC_NEXT);
+
+ if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) {
+#if UNIV_PAGE_SIZE <= 32768
+ /* Note that for 64 KiB pages, field_value can 'wrap around'
+ and the debug assertion is not valid */
+
+ /* In the following assertion, field_value is interpreted
+ as signed 16-bit integer in 2's complement arithmetics.
+ If all platforms defined int16_t in the standard headers,
+ the expression could be written simpler as
+ (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE
+ */
+ ut_ad((field_value >= 32768
+ ? field_value - 65536
+ : field_value)
+ + ut_align_offset(rec, UNIV_PAGE_SIZE)
+ < UNIV_PAGE_SIZE);
+#endif
+ if (UNIV_UNLIKELY(field_value == 0)) {
+
+ return(0);
+ }
+
+ /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+ between each record. */
+ ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+ && field_value < 32768)
+ || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+ return(ut_align_offset(rec + field_value, UNIV_PAGE_SIZE));
+ } else {
+ ut_ad(field_value < UNIV_PAGE_SIZE);
+
+ return(field_value);
+ }
+}
+
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint next) /*!< in: offset of the next record */
+{
+ ut_ad(rec);
+ ut_ad(UNIV_PAGE_SIZE > next);
+#if REC_NEXT_MASK != 0xFFFFUL
+# error "REC_NEXT_MASK != 0xFFFFUL"
+#endif
+#if REC_NEXT_SHIFT
+# error "REC_NEXT_SHIFT != 0"
+#endif
+
+ mach_write_to_2(rec - REC_NEXT, next);
+}
+
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint next) /*!< in: offset of the next record */
+{
+ ulint field_value;
+
+ ut_ad(rec);
+ ut_ad(UNIV_PAGE_SIZE > next);
+
+ if (UNIV_UNLIKELY(!next)) {
+ field_value = 0;
+ } else {
+ /* The following two statements calculate
+ next - offset_of_rec mod 64Ki, where mod is the modulo
+ as a non-negative number */
+
+ field_value = (ulint)
+ ((lint) next
+ - (lint) ut_align_offset(rec, UNIV_PAGE_SIZE));
+ field_value &= REC_NEXT_MASK;
+ }
+
+ mach_write_to_2(rec - REC_NEXT, field_value);
+}
+
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK,
+ REC_OLD_N_FIELDS_SHIFT);
+ ut_ad(ret <= REC_MAX_N_FIELDS);
+ ut_ad(ret > 0);
+
+ return(ret);
+}
+
+/******************************************************//**
+The following function is used to set the number of fields
+in an old-style record. */
+UNIV_INLINE
+void
+rec_set_n_fields_old(
+/*=================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint n_fields) /*!< in: the number of fields */
+{
+ ut_ad(rec);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields > 0);
+
+ rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT);
+}
+
+/******************************************************//**
+The following function retrieves the status bits of a new-style record.
+@return status bits */
+UNIV_INLINE
+ulint
+rec_get_status(
+/*===========*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_1(rec, REC_NEW_STATUS,
+ REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT);
+ ut_ad((ret & ~REC_NEW_STATUS_MASK) == 0);
+
+ return(ret);
+}
+
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ ut_ad(rec);
+ ut_ad(index);
+
+ if (!dict_table_is_comp(index->table)) {
+ return(rec_get_n_fields_old(rec));
+ }
+
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_ORDINARY:
+ return(dict_index_get_n_fields(index));
+ case REC_STATUS_NODE_PTR:
+ return(dict_index_get_n_unique_in_tree(index) + 1);
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ return(1);
+ default:
+ ut_error;
+ return(ULINT_UNDEFINED);
+ }
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+ const rec_t* rec) /*!< in: old-style physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_old(
+/*================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint n_owned) /*!< in: the number of owned */
+{
+ rec_set_bit_field_1(rec, n_owned, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+ const rec_t* rec) /*!< in: new-style physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_new(
+/*================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n_owned)/*!< in: the number of owned */
+{
+ rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ if (UNIV_LIKELY_NULL(page_zip)
+ && UNIV_LIKELY(rec_get_status(rec)
+ != REC_STATUS_SUPREMUM)) {
+ page_zip_rec_set_owned(page_zip, rec, n_owned);
+ }
+}
+
+/******************************************************//**
+The following function is used to retrieve the info bits of a record.
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_bits(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ return(rec_get_bit_field_1(
+ rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint bits) /*!< in: info bits */
+{
+ rec_set_bit_field_1(rec, bits, REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint bits) /*!< in: info bits */
+{
+ rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to set the status bits of a new-style record. */
+UNIV_INLINE
+void
+rec_set_status(
+/*===========*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits) /*!< in: info bits */
+{
+ rec_set_bit_field_1(rec, bits, REC_NEW_STATUS,
+ REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record. (Only compact records have status bits.)
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_and_status_bits(
+/*=========================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint bits;
+#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \
+& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)
+# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap"
+#endif
+ if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) {
+ bits = rec_get_info_bits(rec, TRUE) | rec_get_status(rec);
+ } else {
+ bits = rec_get_info_bits(rec, FALSE);
+ ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+ }
+ return(bits);
+}
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record. (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits) /*!< in: info bits */
+{
+#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \
+& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)
+# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap"
+#endif
+ rec_set_status(rec, bits & REC_NEW_STATUS_MASK);
+ rec_set_info_bits_new(rec, bits & ~REC_NEW_STATUS_MASK);
+}
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) {
+ return(UNIV_UNLIKELY(
+ rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
+ REC_INFO_DELETED_FLAG,
+ REC_INFO_BITS_SHIFT)));
+ } else {
+ return(UNIV_UNLIKELY(
+ rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
+ REC_INFO_DELETED_FLAG,
+ REC_INFO_BITS_SHIFT)));
+ }
+}
+
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_old(
+/*=====================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint flag) /*!< in: nonzero if delete marked */
+{
+ ulint val;
+
+ val = rec_get_info_bits(rec, FALSE);
+
+ if (flag) {
+ val |= REC_INFO_DELETED_FLAG;
+ } else {
+ val &= ~REC_INFO_DELETED_FLAG;
+ }
+
+ rec_set_info_bits_old(rec, val);
+}
+
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_new(
+/*=====================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint flag) /*!< in: nonzero if delete marked */
+{
+ ulint val;
+
+ val = rec_get_info_bits(rec, TRUE);
+
+ if (flag) {
+ val |= REC_INFO_DELETED_FLAG;
+ } else {
+ val &= ~REC_INFO_DELETED_FLAG;
+ }
+
+ rec_set_info_bits_new(rec, val);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_rec_set_deleted(page_zip, rec, flag);
+ }
+}
+
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+ibool
+rec_get_node_ptr_flag(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(REC_STATUS_NODE_PTR == rec_get_status(rec));
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the heap number
+field in an old-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_old(
+/*================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint heap_no)/*!< in: the heap number */
+{
+ rec_set_bit_field_2(rec, heap_no, REC_OLD_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the heap number
+field in a new-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_new(
+/*================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint heap_no)/*!< in: the heap number */
+{
+ rec_set_bit_field_2(rec, heap_no, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to test whether the data offsets in the record
+are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+#if TRUE != 1
+#error "TRUE != 1"
+#endif
+
+ return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+ REC_OLD_SHORT_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+ rec_t* rec, /*!< in: physical record */
+ ibool flag) /*!< in: TRUE if 1byte form */
+{
+#if TRUE != 1
+#error "TRUE != 1"
+#endif
+ ut_ad(flag <= TRUE);
+
+ rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+ REC_OLD_SHORT_SHIFT);
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1)));
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2)));
+}
+
+/* Get the base address of offsets. The extra_size is stored at
+this position, and following positions hold the end offsets of
+the fields. */
+#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE)
+
+/**********************************************************//**
+The following function returns the number of allocated elements
+for an array of offsets.
+@return number of elements */
+UNIV_INLINE
+ulint
+rec_offs_get_n_alloc(
+/*=================*/
+ const ulint* offsets)/*!< in: array for rec_get_offsets() */
+{
+ ulint n_alloc;
+ ut_ad(offsets);
+ n_alloc = offsets[0];
+ ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+ UNIV_MEM_ASSERT_W(offsets, n_alloc * sizeof *offsets);
+ return(n_alloc);
+}
+
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+ ulint* offsets, /*!< out: array for rec_get_offsets(),
+ must be allocated */
+ ulint n_alloc) /*!< in: number of elements */
+{
+ ut_ad(offsets);
+ ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+ UNIV_MEM_ASSERT_AND_ALLOC(offsets, n_alloc * sizeof *offsets);
+ offsets[0] = n_alloc;
+}
+
+/**********************************************************//**
+The following function returns the number of fields in a record.
+@return number of fields */
+UNIV_INLINE
+ulint
+rec_offs_n_fields(
+/*==============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_fields;
+ ut_ad(offsets);
+ n_fields = offsets[1];
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+ <= rec_offs_get_n_alloc(offsets));
+ return(n_fields);
+}
+
+/************************************************************//**
+Validates offsets returned by rec_get_offsets().
+@return TRUE if valid */
+UNIV_INLINE
+ibool
+rec_offs_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: record or NULL */
+ const dict_index_t* index, /*!< in: record descriptor or NULL */
+ const ulint* offsets)/*!< in: array returned by
+ rec_get_offsets() */
+{
+ ulint i = rec_offs_n_fields(offsets);
+ ulint last = ULINT_MAX;
+ ulint comp = *rec_offs_base(offsets) & REC_OFFS_COMPACT;
+
+ if (rec) {
+ ut_ad((ulint) rec == offsets[2]);
+ if (!comp) {
+ ut_a(rec_get_n_fields_old(rec) >= i);
+ }
+ }
+ if (index) {
+ ulint max_n_fields;
+ ut_ad((ulint) index == offsets[3]);
+ max_n_fields = ut_max(
+ dict_index_get_n_fields(index),
+ dict_index_get_n_unique_in_tree(index) + 1);
+ if (comp && rec) {
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_ORDINARY:
+ break;
+ case REC_STATUS_NODE_PTR:
+ max_n_fields = dict_index_get_n_unique_in_tree(
+ index) + 1;
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ max_n_fields = 1;
+ break;
+ default:
+ ut_error;
+ }
+ }
+ /* index->n_def == 0 for dummy indexes if !comp */
+ ut_a(!comp || index->n_def);
+ ut_a(!index->n_def || i <= max_n_fields);
+ }
+ while (i--) {
+ ulint curr = rec_offs_base(offsets)[1 + i] & REC_OFFS_MASK;
+ ut_a(curr <= last);
+ last = curr;
+ }
+ return(TRUE);
+}
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Updates debug data in offsets, in order to avoid bogus
+rec_offs_validate() failures. */
+UNIV_INLINE
+void
+rec_offs_make_valid(
+/*================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets)/*!< in: array returned by
+ rec_get_offsets() */
+{
+ ut_ad(rec);
+ ut_ad(index);
+ ut_ad(offsets);
+ ut_ad(rec_get_n_fields(rec, index) >= rec_offs_n_fields(offsets));
+ offsets[2] = (ulint) rec;
+ offsets[3] = (ulint) index;
+}
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+ulint
+rec_get_nth_field_offs(
+/*===================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+{
+ ulint offs;
+ ulint length;
+ ut_ad(n < rec_offs_n_fields(offsets));
+ ut_ad(len);
+
+ if (UNIV_UNLIKELY(n == 0)) {
+ offs = 0;
+ } else {
+ offs = rec_offs_base(offsets)[n] & REC_OFFS_MASK;
+ }
+
+ length = rec_offs_base(offsets)[1 + n];
+
+ if (length & REC_OFFS_SQL_NULL) {
+ length = UNIV_SQL_NULL;
+ } else {
+ length &= REC_OFFS_MASK;
+ length -= offs;
+ }
+
+ *len = length;
+ return(offs);
+}
+
+/******************************************************//**
+Determine if the offsets are for a record in the new
+compact format.
+@return nonzero if compact format */
+UNIV_INLINE
+ulint
+rec_offs_comp(
+/*==========*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ return(*rec_offs_base(offsets) & REC_OFFS_COMPACT);
+}
+
+/******************************************************//**
+Determine if the offsets are for a record containing
+externally stored columns.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_any_extern(
+/*================*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ return(UNIV_UNLIKELY(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL));
+}
+
+/******************************************************//**
+Returns nonzero if the extern bit is set in nth field of rec.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_nth_extern(
+/*================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n]
+ & REC_OFFS_EXTERNAL));
+}
+
+/******************************************************//**
+Returns nonzero if the SQL NULL bit is set in nth field of rec.
+@return nonzero if SQL NULL */
+UNIV_INLINE
+ulint
+rec_offs_nth_sql_null(
+/*==================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n]
+ & REC_OFFS_SQL_NULL));
+}
+
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ if (!n) {
+ return(rec_offs_base(offsets)[1 + n] & REC_OFFS_MASK);
+ }
+ return((rec_offs_base(offsets)[1 + n] - rec_offs_base(offsets)[n])
+ & REC_OFFS_MASK);
+}
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n = 0;
+
+ if (rec_offs_any_extern(offsets)) {
+ ulint i;
+
+ for (i = rec_offs_n_fields(offsets); i--; ) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ n++;
+ }
+ }
+ }
+
+ return(n);
+}
+
+/******************************************************//**
+Returns the offset of n - 1th field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value. This function and the 2-byte counterpart are defined here because the
+C-compiler was not able to sum negative and positive constant offsets, and
+warned of constant arithmetic overflow within the compiler.
+@return offset of the start of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_prev_field_end_info(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n)));
+}
+
+/******************************************************//**
+Returns the offset of n - 1th field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_prev_field_end_info(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n)));
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+1-byte format. */
+UNIV_INLINE
+void
+rec_1_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: field index */
+ ulint info) /*!< in: value to set */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info);
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+2-byte format. */
+UNIV_INLINE
+void
+rec_2_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: field index */
+ ulint info) /*!< in: value to set */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 1-byte
+offsets form.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_1_get_field_start_offs(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_1_get_prev_field_end_info(rec, n)
+ & ~REC_1BYTE_SQL_NULL_MASK);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 2-byte
+offsets form.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_2_get_field_start_offs(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_2_get_prev_field_end_info(rec, n)
+ & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK));
+}
+
+/******************************************************//**
+The following function is used to read the offset of the start of a data field
+in the record. The start of an SQL null field is the end offset of the
+previous non-null field, or 0, if none exists. If n is the number of the last
+field + 1, then the end offset of the last field is returned.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_get_field_start_offs(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec);
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ return(rec_1_get_field_start_offs(rec, n));
+ }
+
+ return(rec_2_get_field_start_offs(rec, n));
+}
+
+/************************************************************//**
+Gets the physical size of an old-style field.
+Also an SQL null may have a field of size > 0,
+if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: index of the field */
+{
+ ulint os;
+ ulint next_os;
+
+ os = rec_get_field_start_offs(rec, n);
+ next_os = rec_get_field_start_offs(rec, n + 1);
+
+ ut_ad(next_os - os < UNIV_PAGE_SIZE);
+
+ return(next_os - os);
+}
+
+/***********************************************************//**
+This is used to modify the value of an already existing field in a record.
+The previous value must have exactly the same size as the new value. If len
+is UNIV_SQL_NULL then the field is treated as an SQL null.
+For records in ROW_FORMAT=COMPACT (new-style records), len must not be
+UNIV_SQL_NULL unless the field already is SQL null. */
+UNIV_INLINE
+void
+rec_set_nth_field(
+/*==============*/
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index number of the field */
+ const void* data, /*!< in: pointer to the data
+ if not SQL null */
+ ulint len) /*!< in: length of the data or UNIV_SQL_NULL */
+{
+ byte* data2;
+ ulint len2;
+
+ ut_ad(rec);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (UNIV_UNLIKELY(len == UNIV_SQL_NULL)) {
+ if (!rec_offs_nth_sql_null(offsets, n)) {
+ ut_a(!rec_offs_comp(offsets));
+ rec_set_nth_field_sql_null(rec, n);
+ }
+
+ return;
+ }
+
+ data2 = rec_get_nth_field(rec, offsets, n, &len2);
+ if (len2 == UNIV_SQL_NULL) {
+ ut_ad(!rec_offs_comp(offsets));
+ rec_set_nth_field_null_bit(rec, n, FALSE);
+ ut_ad(len == rec_get_nth_field_size(rec, n));
+ } else {
+ ut_ad(len2 == len);
+ }
+
+ ut_memcpy(data2, data, len);
+}
+
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ut_ad(rec);
+
+ return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec)));
+}
+
+/**********************************************************//**
+The following function sets the number of fields in offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_fields(
+/*==================*/
+ ulint* offsets, /*!< in/out: array returned by
+ rec_get_offsets() */
+ ulint n_fields) /*!< in: number of fields */
+{
+ ut_ad(offsets);
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+ <= rec_offs_get_n_alloc(offsets));
+ offsets[1] = n_fields;
+}
+
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint size;
+
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ size = rec_offs_base(offsets)[rec_offs_n_fields(offsets)]
+ & REC_OFFS_MASK;
+ ut_ad(size < UNIV_PAGE_SIZE);
+ return(size);
+}
+
+/**********************************************************//**
+Returns the total size of record minus data size of record. The value
+returned by the function is the distance from record start to record origin
+in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint size;
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ size = *rec_offs_base(offsets) & ~(REC_OFFS_COMPACT | REC_OFFS_EXTERNAL);
+ ut_ad(size < UNIV_PAGE_SIZE);
+ return(size);
+}
+
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets));
+}
+
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(rec + rec_offs_data_size(offsets));
+}
+
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(rec - rec_offs_extra_size(offsets));
+}
+
+/***************************************************************//**
+Copies a physical record to a buffer.
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+/*=====*/
+ void* buf, /*!< in: buffer */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint extra_len;
+ ulint data_len;
+
+ ut_ad(rec && buf);
+ ut_ad(rec_offs_validate((rec_t*) rec, NULL, offsets));
+ ut_ad(rec_validate(rec, offsets));
+
+ extra_len = rec_offs_extra_size(offsets);
+ data_len = rec_offs_data_size(offsets);
+
+ ut_memcpy(buf, rec - extra_len, extra_len + data_len);
+
+ return((byte*)buf + extra_len);
+}
+
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ ulint data_size, /*!< in: data size */
+ ulint n_fields, /*!< in: number of fields */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+ return(REC_N_OLD_EXTRA_BYTES + n_fields);
+ }
+
+ return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields);
+}
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ ulint data_size;
+ ulint extra_size;
+
+ ut_ad(index);
+ ut_ad(dtuple);
+ ut_ad(dtuple_check_typed(dtuple));
+
+ ut_ad(index->type & DICT_UNIVERSAL
+ || dtuple_get_n_fields(dtuple)
+ == (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
+ == REC_STATUS_NODE_PTR)
+ ? dict_index_get_n_unique_in_tree(index) + 1
+ : dict_index_get_n_fields(index)));
+
+ if (dict_table_is_comp(index->table)) {
+ return(rec_get_converted_size_comp(index,
+ dtuple_get_info_bits(dtuple)
+ & REC_NEW_STATUS_MASK,
+ dtuple->fields,
+ dtuple->n_fields, NULL));
+ }
+
+ data_size = dtuple_get_data_size(dtuple, 0);
+
+ extra_size = rec_get_converted_extra_size(
+ data_size, dtuple_get_n_fields(dtuple), n_ext);
+
+ return(data_size + extra_size);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Folds a prefix of a physical record to a ulint. Folds only existing fields,
+that is, checks that we do not run out of the record.
+@return the folded value */
+UNIV_INLINE
+ulint
+rec_fold(
+/*=====*/
+ const rec_t* rec, /*!< in: the physical record */
+ const ulint* offsets, /*!< in: array returned by
+ rec_get_offsets() */
+ ulint n_fields, /*!< in: number of complete
+ fields to fold */
+ ulint n_bytes, /*!< in: number of bytes to fold
+ in an incomplete last field */
+ dulint tree_id) /*!< in: index tree id */
+{
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+ ulint n_fields_rec;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_validate(rec, offsets));
+ ut_ad(n_fields + n_bytes > 0);
+
+ n_fields_rec = rec_offs_n_fields(offsets);
+ ut_ad(n_fields <= n_fields_rec);
+ ut_ad(n_fields < n_fields_rec || n_bytes == 0);
+
+ if (n_fields > n_fields_rec) {
+ n_fields = n_fields_rec;
+ }
+
+ if (n_fields == n_fields_rec) {
+ n_bytes = 0;
+ }
+
+ fold = ut_fold_dulint(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h
new file mode 100644
index 00000000000..8b84d4af233
--- /dev/null
+++ b/storage/innobase/include/rem0types.h
@@ -0,0 +1,46 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0types.h
+Record manager global types
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0types_h
+#define rem0types_h
+
+/* We define the physical record simply as an array of bytes */
+typedef byte rec_t;
+
+/* Maximum values for various fields (for non-blob tuples) */
+#define REC_MAX_N_FIELDS (1024 - 1)
+#define REC_MAX_HEAP_NO (2 * 8192 - 1)
+#define REC_MAX_N_OWNED (16 - 1)
+
+/* REC_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
+indexed column length (or indexed prefix length). It is set to 3*256,
+so that one can create a column prefix index on 256 characters of a
+TEXT or VARCHAR column also in the UTF-8 charset. In that charset,
+a character may take at most 3 bytes.
+This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define REC_MAX_INDEX_COL_LEN 768
+
+#endif
diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h
new file mode 100644
index 00000000000..43d82d644e6
--- /dev/null
+++ b/storage/innobase/include/row0ext.h
@@ -0,0 +1,95 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.h
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#ifndef row0ext_h
+#define row0ext_h
+
+#include "univ.i"
+#include "row0types.h"
+#include "data0types.h"
+#include "mem0mem.h"
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+UNIV_INTERN
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge). */
+ ulint zip_size,/*!< compressed page size in bytes, or 0 */
+ mem_heap_t* heap); /*!< in: heap where created */
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+ const row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint* len); /*!< out: length of prefix, in bytes,
+ at most REC_MAX_INDEX_COL_LEN */
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+ const row_ext_t* ext, /*!< in: column prefix cache */
+ ulint col, /*!< in: column number in the InnoDB
+ table object, as reported by
+ dict_col_get_no(); NOT relative to the
+ records in the clustered index */
+ ulint* len); /*!< out: length of prefix, in bytes,
+ at most REC_MAX_INDEX_COL_LEN */
+
+/** Prefixes of externally stored columns */
+struct row_ext_struct{
+ ulint n_ext; /*!< number of externally stored columns */
+ const ulint* ext; /*!< col_no's of externally stored columns */
+ byte* buf; /*!< backing store of the column prefix cache */
+ ulint len[1]; /*!< prefix lengths; 0 if not cached */
+};
+
+#ifndef UNIV_NONINL
+#include "row0ext.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0ext.ic b/storage/innobase/include/row0ext.ic
new file mode 100644
index 00000000000..82771a9312a
--- /dev/null
+++ b/storage/innobase/include/row0ext.ic
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.ic
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "rem0types.h"
+#include "btr0types.h"
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+ const row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint* len) /*!< out: length of prefix, in bytes,
+ at most REC_MAX_INDEX_COL_LEN */
+{
+ ut_ad(ext);
+ ut_ad(len);
+ ut_ad(i < ext->n_ext);
+
+ *len = ext->len[i];
+
+ if (UNIV_UNLIKELY(*len == 0)) {
+ /* The BLOB could not be fetched to the cache. */
+ return(field_ref_zero);
+ } else {
+ return(ext->buf + i * REC_MAX_INDEX_COL_LEN);
+ }
+}
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+ const row_ext_t* ext, /*!< in: column prefix cache */
+ ulint col, /*!< in: column number in the InnoDB
+ table object, as reported by
+ dict_col_get_no(); NOT relative to the
+ records in the clustered index */
+ ulint* len) /*!< out: length of prefix, in bytes,
+ at most REC_MAX_INDEX_COL_LEN */
+{
+ ulint i;
+
+ ut_ad(ext);
+ ut_ad(len);
+
+ for (i = 0; i < ext->n_ext; i++) {
+ if (col == ext->ext[i]) {
+ return(row_ext_lookup_ith(ext, i, len));
+ }
+ }
+
+ return(NULL);
+}
diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h
new file mode 100644
index 00000000000..530622e6225
--- /dev/null
+++ b/storage/innobase/include/row0ins.h
@@ -0,0 +1,156 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.h
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0ins_h
+#define row0ins_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_foreign_key_check_lock.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or
+DB_ROW_IS_REFERENCED */
+UNIV_INTERN
+ulint
+row_ins_check_foreign_constraint(
+/*=============================*/
+ ibool check_ref,/*!< in: TRUE If we want to check that
+ the referenced table is ok, FALSE if we
+ want to to check the foreign key table */
+ dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Creates an insert node struct.
+@return own: insert node struct */
+UNIV_INTERN
+ins_node_t*
+ins_node_create(
+/*============*/
+ ulint ins_type, /*!< in: INS_VALUES, ... */
+ dict_table_t* table, /*!< in: table where to insert */
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+UNIV_INTERN
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /*!< in: insert node */
+ dtuple_t* row); /*!< in: new row (or first row) for the node */
+/***************************************************************//**
+Inserts an index entry to index. Tries first optimistic, then pessimistic
+descent down the tree. If the entry matches enough to a delete marked record,
+performs the insert by updating or delete unmarking the delete marked
+record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+ulint
+row_ins_index_entry(
+/*================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ ibool foreign,/*!< in: TRUE=check foreign key constraints */
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in
+SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_ins_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************//**
+Creates an entry template for each index of a table. */
+UNIV_INTERN
+void
+ins_node_create_entry_list(
+/*=======================*/
+ ins_node_t* node); /*!< in: row insert node */
+
+/* Insert node structure */
+
+struct ins_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_INSERT */
+ ulint ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+ dtuple_t* row; /*!< row to insert */
+ dict_table_t* table; /*!< table where to insert */
+ sel_node_t* select; /*!< select in searched insert */
+ que_node_t* values_list;/* list of expressions to evaluate and
+ insert in an INS_VALUES insert */
+ ulint state; /*!< node execution state */
+ dict_index_t* index; /*!< NULL, or the next index where the index
+ entry should be inserted */
+ dtuple_t* entry; /*!< NULL, or entry to insert in the index;
+ after a successful insert of the entry,
+ this should be reset to NULL */
+ UT_LIST_BASE_NODE_T(dtuple_t)
+ entry_list;/* list of entries, one for each index */
+ byte* row_id_buf;/* buffer for the row id sys field in row */
+ trx_id_t trx_id; /*!< trx id or the last trx which executed the
+ node */
+ byte* trx_id_buf;/* buffer for the trx id sys field in row */
+ mem_heap_t* entry_sys_heap;
+ /* memory heap used as auxiliary storage;
+ entry_list and sys fields are stored here;
+ if this is NULL, entry list should be created
+ and buffers for sys fields in row allocated */
+ ulint magic_n;
+};
+
+#define INS_NODE_MAGIC_N 15849075
+
+/* Insert node types */
+#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */
+#define INS_VALUES 1 /* INSERT INTO ... VALUES ... */
+#define INS_DIRECT 2 /* this is for internal use in dict0crea:
+ insert the row directly */
+
+/* Node execution states */
+#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */
+#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */
+#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and
+ inserted */
+
+#ifndef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0ins.ic b/storage/innobase/include/row0ins.ic
new file mode 100644
index 00000000000..84f6da255bf
--- /dev/null
+++ b/storage/innobase/include/row0ins.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.ic
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
new file mode 100644
index 00000000000..62a5efd11f7
--- /dev/null
+++ b/storage/innobase/include/row0merge.h
@@ -0,0 +1,197 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0merge.h
+Index build routines using a merge sort
+
+Created 13/06/2005 Jan Lindstrom
+*******************************************************/
+
+#ifndef row0merge_h
+#define row0merge_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "rem0rec.h"
+#include "read0types.h"
+#include "btr0types.h"
+#include "row0mysql.h"
+#include "lock0types.h"
+
+/** Index field definition */
+struct merge_index_field_struct {
+ ulint prefix_len; /*!< column prefix length, or 0
+ if indexing the whole column */
+ const char* field_name; /*!< field name */
+};
+
+/** Index field definition */
+typedef struct merge_index_field_struct merge_index_field_t;
+
+/** Definition of an index being created */
+struct merge_index_def_struct {
+ const char* name; /*!< index name */
+ ulint ind_type; /*!< 0, DICT_UNIQUE,
+ or DICT_CLUSTERED */
+ ulint n_fields; /*!< number of fields
+ in index */
+ merge_index_field_t* fields; /*!< field definitions */
+};
+
+/** Definition of an index being created */
+typedef struct merge_index_def_struct merge_index_def_t;
+
+/*********************************************************************//**
+Sets an exclusive lock on a table, for the duration of creating indexes.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode); /*!< in: LOCK_X or LOCK_S */
+/*********************************************************************//**
+Drop an index from the InnoDB system tables. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_index(
+/*=================*/
+ dict_index_t* index, /*!< in: index to be removed */
+ dict_table_t* table, /*!< in: table */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Drop those indexes which were created before an error occurred when
+building an index. The data dictionary must have been locked
+exclusively by the caller, because the transaction will not be
+committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table containing the indexes */
+ dict_index_t** index, /*!< in: indexes to drop */
+ ulint num_created); /*!< in: number of elements in index[] */
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void);
+/*=============================*/
+/*********************************************************************//**
+Rename the tables in the data dictionary. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_rename_tables(
+/*====================*/
+ dict_table_t* old_table, /*!< in/out: old table, renamed to
+ tmp_name */
+ dict_table_t* new_table, /*!< in/out: new table, renamed to
+ old_table->name */
+ const char* tmp_name, /*!< in: new name for old_table */
+ trx_t* trx); /*!< in: transaction handle */
+
+/*********************************************************************//**
+Create a temporary table for creating a primary key, using the definition
+of an existing table.
+@return table, or NULL on error */
+UNIV_INTERN
+dict_table_t*
+row_merge_create_temporary_table(
+/*=============================*/
+ const char* table_name, /*!< in: new table name */
+ const merge_index_def_t*index_def, /*!< in: the index definition
+ of the primary key */
+ const dict_table_t* table, /*!< in: old table definition */
+ trx_t* trx); /*!< in/out: transaction
+ (sets error_state) */
+/*********************************************************************//**
+Rename the temporary indexes in the dictionary to permanent ones. The
+data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+row_merge_rename_indexes(
+/*=====================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table); /*!< in/out: table with new indexes */
+/*********************************************************************//**
+Create the index and load in to the dictionary.
+@return index, or NULL on error */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+ trx_t* trx, /*!< in/out: trx (sets error_state) */
+ dict_table_t* table, /*!< in: the index is on this table */
+ const merge_index_def_t*index_def);
+ /*!< in: the index definition */
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return TRUE if index can be used by the transaction else FALSE */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index); /*!< in: index to check */
+/*********************************************************************//**
+If there are views that refer to the old table name then we "attach" to
+the new instance of the table else we drop it immediately.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_drop_table(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table); /*!< in: table instance to drop */
+
+/*********************************************************************//**
+Build indexes on a table by reading a clustered index,
+creating a temporary file containing index entries, merge sorting
+these index entries and inserting sorted index entries to indexes.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_build_indexes(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* old_table, /*!< in: table where rows are
+ read from */
+ dict_table_t* new_table, /*!< in: table where indexes are
+ created; identical to old_table
+ unless creating a PRIMARY KEY */
+ dict_index_t** indexes, /*!< in: indexes to be created */
+ ulint n_indexes, /*!< in: size of indexes[] */
+ TABLE* table); /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+#endif /* row0merge.h */
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
new file mode 100644
index 00000000000..97028622505
--- /dev/null
+++ b/storage/innobase/include/row0mysql.h
@@ -0,0 +1,784 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.h
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0mysql_h
+#define row0mysql_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "btr0pcur.h"
+#include "trx0types.h"
+
+extern ibool row_rollback_on_timeout;
+
+typedef struct row_prebuilt_struct row_prebuilt_t;
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct of a
+ ha_innobase:: table handle */
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ byte* dest, /*!< in: where to store */
+ ulint len, /*!< in: length, must fit in two bytes */
+ ulint lenlen);/*!< in: storage length of len: either 1 or 2 bytes */
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ ulint* len, /*!< out: variable-length field length */
+ const byte* field, /*!< in: field in the MySQL format */
+ ulint lenlen);/*!< in: storage length of len: either 1
+ or 2 bytes */
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+UNIV_INTERN
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /*!< in: where to store */
+ ulint col_len,/*!< in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const void* data, /*!< in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len); /*!< in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+UNIV_INTERN
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ ulint* len, /*!< out: BLOB length */
+ const byte* ref, /*!< in: BLOB reference in the
+ MySQL format */
+ ulint col_len); /*!< in: BLOB reference length
+ (not BLOB length) */
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.c.
+@return up to which byte we used buf in the conversion */
+UNIV_INTERN
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /*!< in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /*!< in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! */
+ ibool row_format_col, /*!< TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ const byte* mysql_data, /*!< in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /*!< in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ulint comp); /*!< in: nonzero=compact format */
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running the
+query thread */
+UNIV_INTERN
+ibool
+row_mysql_handle_errors(
+/*====================*/
+ ulint* new_err,/*!< out: possible new error encountered in
+ rollback, or the old error which was
+ during the function entry */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_savept_t* savept);/*!< in: savepoint */
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+UNIV_INTERN
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ dict_table_t* table); /*!< in: Innobase table handle */
+/********************************************************************//**
+Free a prebuilt struct for a MySQL table handle. */
+UNIV_INTERN
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
+ ibool dict_locked); /*!< in: TRUE=data dictionary locked */
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+UNIV_INTERN
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ in MySQL handle */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. */
+UNIV_INTERN
+void
+row_unlock_table_autoinc_for_mysql(
+/*===============================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in the MySQL
+ table handle */
+/*********************************************************************//**
+Sets a table lock on the table mentioned in prebuilt.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_for_mysql(
+/*=====================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL
+ table handle */
+ dict_table_t* table, /*!< in: table to lock, or NULL
+ if prebuilt->table should be
+ locked as
+ prebuilt->select_lock_type */
+ ulint mode); /*!< in: lock mode of table
+ (ignored if table==NULL) */
+
+/*********************************************************************//**
+Does an insert for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_insert_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: row in the MySQL format */
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+UNIV_INTERN
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+UNIV_INTERN
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+Checks if a table is such that we automatically created a clustered
+index on it (on row id).
+@return TRUE if the clustered index was generated automatically */
+UNIV_INTERN
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+ const dict_table_t* table); /*!< in: table */
+/*********************************************************************//**
+Calculates the key number used inside MySQL for an Innobase index. We have
+to take into account if we generated a default clustered index for the table
+@return the key number used inside MySQL */
+UNIV_INTERN
+ulint
+row_get_mysql_key_number_for_index(
+/*===============================*/
+ const dict_index_t* index); /*!< in: index */
+/*********************************************************************//**
+Does an update or delete of a row for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_update_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: the row to be updated, in
+ the MySQL format */
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+This can only be used when srv_locks_unsafe_for_binlog is TRUE or
+session is using a READ COMMITTED isolation level. Before
+calling this function we must use trx_reset_new_rec_lock_info() and
+trx_register_new_rec_lock() to store the information which new record locks
+really were set. This function removes a newly set lock under prebuilt->pcur,
+and also under prebuilt->clust_pcur. Currently, this is only used and tested
+in the case of an UPDATE or a DELETE statement, where the row lock is of the
+LOCK_X type.
+Thus, this implements a 'mini-rollback' that releases the latest record
+locks we set.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_unlock_for_mysql(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in MySQL
+ handle */
+ ibool has_latches_on_recs);/*!< TRUE if called so that we have
+ the latches on the records under pcur
+ and clust_pcur, and we do not need to
+ reposition the cursors. */
+/*********************************************************************//**
+Creates an query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to update */
+ mem_heap_t* heap); /*!< in: mem heap from which allocated */
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_update_cascade_for_mysql(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ upd_node_t* node, /*!< in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table); /*!< in: table where we do the operation */
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. */
+UNIV_INTERN
+void
+row_mysql_lock_data_dictionary_func(
+/*================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line); /*!< in: line number */
+#define row_mysql_lock_data_dictionary(trx) \
+ row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__)
+/*********************************************************************//**
+Unlocks the data dictionary exclusive lock. */
+UNIV_INTERN
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Locks the data dictionary in shared mode from modifications, for performing
+foreign key check, rollback, or other operation invisible to MySQL. */
+UNIV_INTERN
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line); /*!< in: line number */
+#define row_mysql_freeze_data_dictionary(trx) \
+ row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__)
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. */
+UNIV_INTERN
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Creates a table for MySQL. If the name of the table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also start the printing of monitor
+output by the master thread. If the table name ends in "innodb_mem_validate",
+InnoDB will try to invoke mem_validate().
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_table_for_mysql(
+/*=======================*/
+ dict_table_t* table, /*!< in, own: table definition
+ (will be freed) */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Does an index creation operation for MySQL. TODO: currently failure
+to create an index results in dropping the whole table! This is no problem
+currently as all indexes must be created at the same time as the table.
+@return error number or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_index_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in, own: index definition
+ (will be freed) */
+ trx_t* trx, /*!< in: transaction handle */
+ const ulint* field_lengths); /*!< in: if not NULL, must contain
+ dict_index_get_n_fields(index)
+ actual field lengths for the
+ index columns, which are
+ then checked for not being too
+ large. */
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+bot participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_table_add_foreign_constraints(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES table2(c, d),
+ table2 can be written also with the
+ database name before it: test.table2 */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks); /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
+
+/*********************************************************************//**
+The master thread in srv0srv.c calls this regularly to drop tables which
+we must drop in background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
+@return how many tables dropped + remaining tables in list */
+UNIV_INTERN
+ulint
+row_drop_tables_for_mysql_in_background(void);
+/*=========================================*/
+/*********************************************************************//**
+Get the background drop list length. NOTE: the caller must own the kernel
+mutex!
+@return how many tables in list */
+UNIV_INTERN
+ulint
+row_get_background_drop_list_len_low(void);
+/*======================================*/
+/*********************************************************************//**
+Truncates a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_truncate_table_for_mysql(
+/*=========================*/
+ dict_table_t* table, /*!< in: table handle */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Drops a table for MySQL. If the name of the dropped table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also stop the printing of monitor
+output by the master thread. If the data dictionary was not already locked
+by the transaction, the transaction will be committed. Otherwise, the
+data dictionary will remain locked.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_table_for_mysql(
+/*=====================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx, /*!< in: transaction handle */
+ ibool drop_db);/*!< in: TRUE=dropping whole database */
+
+/*********************************************************************//**
+Discards the tablespace of a table which stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the flag table->ibd_file_missing is set TRUE.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx); /*!< in: transaction handle */
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_import_tablespace_for_mysql(
+/*============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Drops a database for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_database_for_mysql(
+/*========================*/
+ const char* name, /*!< in: database name which ends to '/' */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_rename_table_for_mysql(
+/*=======================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx, /*!< in: transaction handle */
+ ibool commit); /*!< in: if TRUE then commit trx */
+/*********************************************************************//**
+Checks a table for corruption.
+@return DB_ERROR or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_check_table_for_mysql(
+/*======================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+
+/*********************************************************************//**
+Determines if a table is a magic monitor table.
+@return TRUE if monitor table */
+UNIV_INTERN
+ibool
+row_is_magic_monitor_table(
+/*=======================*/
+ const char* table_name); /*!< in: name of the table, in the
+ form database/table_name */
+
+/* A struct describing a place for an individual column in the MySQL
+row format which is presented to the table handler in ha_innobase.
+This template struct is used to speed up row transformations between
+Innobase and MySQL. */
+
+typedef struct mysql_row_templ_struct mysql_row_templ_t;
+struct mysql_row_templ_struct {
+ ulint col_no; /*!< column number of the column */
+ ulint rec_field_no; /*!< field number of the column in an
+ Innobase record in the current index;
+ not defined if template_type is
+ ROW_MYSQL_WHOLE_ROW */
+ ulint mysql_col_offset; /*!< offset of the column in the MySQL
+ row format */
+ ulint mysql_col_len; /*!< length of the column in the MySQL
+ row format */
+ ulint mysql_null_byte_offset; /*!< MySQL NULL bit byte offset in a
+ MySQL record */
+ ulint mysql_null_bit_mask; /*!< bit mask to get the NULL bit,
+ zero if column cannot be NULL */
+ ulint type; /*!< column type in Innobase mtype
+ numbers DATA_CHAR... */
+ ulint mysql_type; /*!< MySQL type code; this is always
+ < 256 */
+ ulint mysql_length_bytes; /*!< if mysql_type
+ == DATA_MYSQL_TRUE_VARCHAR, this tells
+ whether we should use 1 or 2 bytes to
+ store the MySQL true VARCHAR data
+ length at the start of row in the MySQL
+ format (NOTE that the MySQL key value
+ format always uses 2 bytes for the data
+ len) */
+ ulint charset; /*!< MySQL charset-collation code
+ of the column, or zero */
+ ulint mbminlen; /*!< minimum length of a char, in bytes,
+ or zero if not a char type */
+ ulint mbmaxlen; /*!< maximum length of a char, in bytes,
+ or zero if not a char type */
+ ulint is_unsigned; /*!< if a column type is an integer
+ type and this field is != 0, then
+ it is an unsigned integer type */
+};
+
+#define MYSQL_FETCH_CACHE_SIZE 8
+/* After fetching this many rows, we start caching them in fetch_cache */
+#define MYSQL_FETCH_CACHE_THRESHOLD 4
+
+#define ROW_PREBUILT_ALLOCATED 78540783
+#define ROW_PREBUILT_FREED 26423527
+
+/** A struct for (sometimes lazily) prebuilt structures in an Innobase table
+handle used within MySQL; these are used to save CPU time. */
+
+struct row_prebuilt_struct {
+ ulint magic_n; /*!< this magic number is set to
+ ROW_PREBUILT_ALLOCATED when created,
+ or ROW_PREBUILT_FREED when the
+ struct has been freed */
+ dict_table_t* table; /*!< Innobase table handle */
+ dict_index_t* index; /*!< current index for a search, if
+ any */
+ trx_t* trx; /*!< current transaction handle */
+ unsigned sql_stat_start:1;/*!< TRUE when we start processing of
+ an SQL statement: we may have to set
+ an intention lock on the table,
+ create a consistent read view etc. */
+ unsigned mysql_has_locked:1;/*!< this is set TRUE when MySQL
+ calls external_lock on this handle
+ with a lock flag, and set FALSE when
+ with the F_UNLOCK flag */
+ unsigned clust_index_was_generated:1;
+ /*!< if the user did not define a
+ primary key in MySQL, then Innobase
+ automatically generated a clustered
+ index where the ordering column is
+ the row id: in this case this flag
+ is set to TRUE */
+ unsigned index_usable:1; /*!< caches the value of
+ row_merge_is_index_usable(trx,index) */
+ unsigned read_just_key:1;/*!< set to 1 when MySQL calls
+ ha_innobase::extra with the
+ argument HA_EXTRA_KEYREAD; it is enough
+ to read just columns defined in
+ the index (i.e., no read of the
+ clustered index record necessary) */
+ unsigned used_in_HANDLER:1;/*!< TRUE if we have been using this
+ handle in a MySQL HANDLER low level
+ index cursor command: then we must
+ store the pcur position even in a
+ unique search from a clustered index,
+ because HANDLER allows NEXT and PREV
+ in such a situation */
+ unsigned template_type:2;/*!< ROW_MYSQL_WHOLE_ROW,
+ ROW_MYSQL_REC_FIELDS,
+ ROW_MYSQL_DUMMY_TEMPLATE, or
+ ROW_MYSQL_NO_TEMPLATE */
+ unsigned n_template:10; /*!< number of elements in the
+ template */
+ unsigned null_bitmap_len:10;/*!< number of bytes in the SQL NULL
+ bitmap at the start of a row in the
+ MySQL format */
+ unsigned need_to_access_clustered:1; /*!< if we are fetching
+ columns through a secondary index
+ and at least one column is not in
+ the secondary index, then this is
+ set to TRUE */
+ unsigned templ_contains_blob:1;/*!< TRUE if the template contains
+ BLOB column(s) */
+ mysql_row_templ_t* mysql_template;/*!< template used to transform
+ rows fast between MySQL and Innobase
+ formats; memory for this template
+ is not allocated from 'heap' */
+ mem_heap_t* heap; /*!< memory heap from which
+ these auxiliary structures are
+ allocated when needed */
+ ins_node_t* ins_node; /*!< Innobase SQL insert node
+ used to perform inserts
+ to the table */
+ byte* ins_upd_rec_buff;/*!< buffer for storing data converted
+ to the Innobase format from the MySQL
+ format */
+ const byte* default_rec; /*!< the default values of all columns
+ (a "default row") in MySQL format */
+ ulint hint_need_to_fetch_extra_cols;
+ /*!< normally this is set to 0; if this
+ is set to ROW_RETRIEVE_PRIMARY_KEY,
+ then we should at least retrieve all
+ columns in the primary key; if this
+ is set to ROW_RETRIEVE_ALL_COLS, then
+ we must retrieve all columns in the
+ key (if read_just_key == 1), or all
+ columns in the table */
+ upd_node_t* upd_node; /*!< Innobase SQL update node used
+ to perform updates and deletes */
+ que_fork_t* ins_graph; /*!< Innobase SQL query graph used
+ in inserts */
+ que_fork_t* upd_graph; /*!< Innobase SQL query graph used
+ in updates or deletes */
+ btr_pcur_t* pcur; /*!< persistent cursor used in selects
+ and updates */
+ btr_pcur_t* clust_pcur; /*!< persistent cursor used in
+ some selects and updates */
+ que_fork_t* sel_graph; /*!< dummy query graph used in
+ selects */
+ dtuple_t* search_tuple; /*!< prebuilt dtuple used in selects */
+ byte row_id[DATA_ROW_ID_LEN];
+ /*!< if the clustered index was
+ generated, the row id of the
+ last row fetched is stored
+ here */
+ dtuple_t* clust_ref; /*!< prebuilt dtuple used in
+ sel/upd/del */
+ ulint select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
+ ulint stored_select_lock_type;/*!< this field is used to
+ remember the original select_lock_type
+ that was decided in ha_innodb.cc,
+ ::store_lock(), ::external_lock(),
+ etc. */
+ ulint row_read_type; /*!< ROW_READ_WITH_LOCKS if row locks
+ should be the obtained for records
+ under an UPDATE or DELETE cursor.
+ If innodb_locks_unsafe_for_binlog
+ is TRUE, this can be set to
+ ROW_READ_TRY_SEMI_CONSISTENT, so that
+ if the row under an UPDATE or DELETE
+ cursor was locked by another
+ transaction, InnoDB will resort
+ to reading the last committed value
+ ('semi-consistent read'). Then,
+ this field will be set to
+ ROW_READ_DID_SEMI_CONSISTENT to
+ indicate that. If the row does not
+ match the WHERE condition, MySQL will
+ invoke handler::unlock_row() to
+ clear the flag back to
+ ROW_READ_TRY_SEMI_CONSISTENT and
+ to simply skip the row. If
+ the row matches, the next call to
+ row_search_for_mysql() will lock
+ the row.
+ This eliminates lock waits in some
+ cases; note that this breaks
+ serializability. */
+ ulint new_rec_locks; /*!< normally 0; if
+ srv_locks_unsafe_for_binlog is
+ TRUE or session is using READ
+ COMMITTED isolation level, in a
+ cursor search, if we set a new
+ record lock on an index, this is
+ incremented; this is used in
+ releasing the locks under the
+ cursors if we are performing an
+ UPDATE and we determine after
+ retrieving the row that it does
+ not need to be locked; thus,
+ these can be used to implement a
+ 'mini-rollback' that releases
+ the latest record locks */
+ ulint mysql_prefix_len;/*!< byte offset of the end of
+ the last requested column */
+ ulint mysql_row_len; /*!< length in bytes of a row in the
+ MySQL format */
+ ulint n_rows_fetched; /*!< number of rows fetched after
+ positioning the current cursor */
+ ulint fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */
+ byte* fetch_cache[MYSQL_FETCH_CACHE_SIZE];
+ /*!< a cache for fetched rows if we
+ fetch many rows from the same cursor:
+ it saves CPU time to fetch them in a
+ batch; we reserve mysql_row_len
+ bytes for each such row; these
+ pointers point 4 bytes past the
+ allocated mem buf start, because
+ there is a 4 byte magic number at the
+ start and at the end */
+ ibool keep_other_fields_on_keyread; /*!< when using fetch
+ cache with HA_EXTRA_KEYREAD, don't
+ overwrite other fields in mysql row
+ row buffer.*/
+ ulint fetch_cache_first;/*!< position of the first not yet
+ fetched row in fetch_cache */
+ ulint n_fetch_cached; /*!< number of not yet fetched rows
+ in fetch_cache */
+ mem_heap_t* blob_heap; /*!< in SELECTS BLOB fields are copied
+ to this heap */
+ mem_heap_t* old_vers_heap; /*!< memory heap where a previous
+ version is built in consistent read */
+ /*----------------------*/
+ ulonglong autoinc_last_value;
+ /*!< last value of AUTO-INC interval */
+ ulonglong autoinc_increment;/*!< The increment step of the auto
+ increment column. Value must be
+ greater than or equal to 1. Required to
+ calculate the next value */
+ ulonglong autoinc_offset; /*!< The offset passed to
+ get_auto_increment() by MySQL. Required
+ to calculate the next value */
+ ulint autoinc_error; /*!< The actual error code encountered
+ while trying to init or read the
+ autoinc value from the table. We
+ store it here so that we can return
+ it to MySQL */
+ /*----------------------*/
+ UT_LIST_NODE_T(row_prebuilt_t) prebuilts;
+ /*!< list node of table->prebuilts */
+ ulint magic_n2; /*!< this should be the same as
+ magic_n */
+};
+
+#define ROW_PREBUILT_FETCH_MAGIC_N 465765687
+
+#define ROW_MYSQL_WHOLE_ROW 0
+#define ROW_MYSQL_REC_FIELDS 1
+#define ROW_MYSQL_NO_TEMPLATE 2
+#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in
+ row_scan_and_check_index */
+
+/* Values for hint_need_to_fetch_extra_cols */
+#define ROW_RETRIEVE_PRIMARY_KEY 1
+#define ROW_RETRIEVE_ALL_COLS 2
+
+/* Values for row_read_type */
+#define ROW_READ_WITH_LOCKS 0
+#define ROW_READ_TRY_SEMI_CONSISTENT 1
+#define ROW_READ_DID_SEMI_CONSISTENT 2
+
+#ifndef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0mysql.ic b/storage/innobase/include/row0mysql.ic
new file mode 100644
index 00000000000..35033aa2ad1
--- /dev/null
+++ b/storage/innobase/include/row0mysql.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.ic
+MySQL interface for Innobase
+
+Created 1/23/2001 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
new file mode 100644
index 00000000000..89ec54fb54a
--- /dev/null
+++ b/storage/innobase/include/row0purge.h
@@ -0,0 +1,96 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0purge.h
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0purge_h
+#define row0purge_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a purge node to a query graph.
+@return own: purge node */
+UNIV_INTERN
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************//**
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_purge_step(
+/*===========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* Purge node structure */
+
+struct purge_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_PURGE */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ roll_ptr_t roll_ptr;/* roll pointer to undo log record */
+ trx_undo_rec_t* undo_rec;/* undo log record */
+ trx_undo_inf_t* reservation;/* reservation for the undo log record in
+ the purge array */
+ undo_no_t undo_no;/* undo number of the record */
+ ulint rec_type;/* undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ btr_pcur_t pcur; /*!< persistent cursor used in searching the
+ clustered index record */
+ ibool found_clust;/* TRUE if the clustered index record
+ determined by ref was found in the clustered
+ index, and we were able to position pcur on
+ it */
+ dict_table_t* table; /*!< table where purge is done */
+ ulint cmpl_info;/* compiler analysis info of an update */
+ upd_t* update; /*!< update vector for a clustered index
+ record */
+ dtuple_t* ref; /*!< NULL, or row reference to the next row to
+ handle */
+ dtuple_t* row; /*!< NULL, or a copy (also fields copied to
+ heap) of the indexed fields of the row to
+ handle */
+ dict_index_t* index; /*!< NULL, or the next index whose record should
+ be handled */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage for
+ row; this must be emptied after a successful
+ purge of a row */
+};
+
+#ifndef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0purge.ic b/storage/innobase/include/row0purge.ic
new file mode 100644
index 00000000000..23d7d3845a4
--- /dev/null
+++ b/storage/innobase/include/row0purge.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+
+/**************************************************//**
+@file include/row0purge.ic
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
new file mode 100644
index 00000000000..723b7b53395
--- /dev/null
+++ b/storage/innobase/include/row0row.h
@@ -0,0 +1,310 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.h
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0row_h
+#define row0row_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "read0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+
+/*********************************************************************//**
+Gets the offset of the trx id field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INTERN
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INTERN
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ row_ext_t* ext, /*!< in: externally stored column prefixes,
+ or NULL */
+ dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap); /*!< in: memory heap from which the memory for
+ the index entry is allocated */
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build(
+/*======*/
+ ulint type, /*!< in: ROW_COPY_POINTERS or
+ ROW_COPY_DATA; the latter
+ copies also the data fields to
+ heap while the first only
+ places pointers to data fields
+ on the index page, and thus is
+ more efficient */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in the clustered
+ index; NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the row dtuple is used! */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ const dict_table_t* col_table,
+ /*!< in: table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead; the user
+ columns in this table should be
+ the same columns as in index->table */
+ row_ext_t** ext, /*!< out, own: cache of
+ externally stored column
+ prefixes, or NULL */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ the memory needed is allocated */
+/*******************************************************************//**
+Converts an index record to a typed data tuple.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ the memory needed is allocated */
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ ulint type, /*!< in: ROW_COPY_DATA, or
+ ROW_COPY_POINTERS: the former
+ copies also the data fields to
+ heap as the latter only places
+ pointers to data fields on the
+ index page */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the dtuple is used! */
+ const dict_index_t* index, /*!< in: index */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ the memory needed is allocated */
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap); /*!< in: memory heap from which the memory
+ needed is allocated */
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INTERN
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /*!< in/out: row reference built;
+ see the NOTE below! */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: the data fields in ref
+ will point directly into this
+ record, therefore, the buffer
+ page of this record must be at
+ least s-latched and the latch
+ held as long as the row
+ reference is used! */
+ const dict_index_t* index, /*!< in: secondary index */
+ ulint* offsets,/*!< in: rec_get_offsets(rec, index)
+ or NULL */
+ trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /*!< in/out: typed data tuple where the
+ reference is built */
+ const ulint* map, /*!< in: array of field numbers in rec
+ telling how ref should be built from
+ the fields of rec */
+ const rec_t* rec, /*!< in: record in the index; must be
+ preserved while ref is used, as we do
+ not copy field values to heap */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row
+reference.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+row_search_on_row_ref(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const dict_table_t* table, /*!< in: table */
+ const dtuple_t* ref, /*!< in: row reference */
+ mtr_t* mtr); /*!< in/out: mtr */
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL, if no record found */
+UNIV_INTERN
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: secondary index */
+ dict_index_t** clust_index,/*!< out: clustered index */
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Searches an index record.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+row_search_index_entry(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: index entry */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr); /*!< in: mtr */
+
+
+#define ROW_COPY_DATA 1
+#define ROW_COPY_POINTERS 2
+
+/* The allowed latching order of index records is the following:
+(1) a secondary index record ->
+(2) the clustered index record ->
+(3) rollback segment data for the clustered index record.
+
+No new latches may be obtained while the kernel mutex is reserved.
+However, the kernel mutex can be reserved while latches are owned. */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+row_raw_format(
+/*===========*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ const dict_field_t* dict_field, /*!< in: index field */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
+
+#ifndef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0row.ic b/storage/innobase/include/row0row.ic
new file mode 100644
index 00000000000..05c007641af
--- /dev/null
+++ b/storage/innobase/include/row0row.ic
@@ -0,0 +1,120 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.ic
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "trx0undo.h"
+
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint offset;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(rec, index, offsets);
+ }
+
+ return(trx_read_trx_id(rec + offset));
+}
+
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint offset;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(rec, index, offsets);
+ }
+
+ return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN));
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /*!< in/out: typed data tuple where the
+ reference is built */
+ const ulint* map, /*!< in: array of field numbers in rec
+ telling how ref should be built from
+ the fields of rec */
+ const rec_t* rec, /*!< in: record in the index; must be
+ preserved while ref is used, as we do
+ not copy field values to heap */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ dfield_t* dfield;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint field_no;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!rec_offs_any_extern(offsets));
+ ref_len = dtuple_get_n_fields(ref);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ field_no = *(map + i);
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ field = rec_get_nth_field(rec, offsets,
+ field_no, &len);
+ dfield_set_data(dfield, field, len);
+ }
+ }
+}
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
new file mode 100644
index 00000000000..01a5afaa23e
--- /dev/null
+++ b/storage/innobase/include/row0sel.h
@@ -0,0 +1,413 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.h
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0sel_h
+#define row0sel_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "btr0pcur.h"
+#include "read0read.h"
+#include "row0mysql.h"
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+UNIV_INTERN
+sel_node_t*
+sel_node_create(
+/*============*/
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+UNIV_INTERN
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node); /*!< in: select node struct */
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+UNIV_INTERN
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */
+/*********************************************************************//**
+Gets the plan node for the nth table in a join.
+@return plan node */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+ sel_node_t* node, /*!< in: select node */
+ ulint i); /*!< in: get ith plan node */
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_sel_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+fetch_step(
+/*=======*/
+ que_thr_t* thr); /*!< in: query thread */
+/****************************************************************//**
+Sample callback function for fetch that prints each row.
+@return always returns non-NULL */
+UNIV_INTERN
+void*
+row_fetch_print(
+/*============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg); /*!< in: not used */
+/****************************************************************//**
+Callback function for fetch that stores an unsigned 4 byte integer to the
+location pointed. The column's type must be DATA_INT, DATA_UNSIGNED, length
+= 4.
+@return always returns NULL */
+UNIV_INTERN
+void*
+row_fetch_store_uint4(
+/*==================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg); /*!< in: data pointer */
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_printf_step(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field len and print a warning if
+such appears. */
+UNIV_INTERN
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /*!< in/out: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /*!< in: buffer to use in field
+ conversions */
+ ulint buf_len, /*!< in: buffer length */
+ dict_index_t* index, /*!< in: index of the key value */
+ const byte* key_ptr, /*!< in: MySQL key value */
+ ulint key_len, /*!< in: MySQL key value length */
+ trx_t* trx); /*!< in: transaction */
+/********************************************************************//**
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position and fetch next or fetch prev must not be tried to the cursor!
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, or DB_TOO_BIG_RECORD */
+UNIV_INTERN
+ulint
+row_search_for_mysql(
+/*=================*/
+ byte* buf, /*!< in/out: buffer for the fetched
+ row in the MySQL format */
+ ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
+ ROW_SEL_EXACT_PREFIX */
+ ulint direction); /*!< in: 0 or ROW_SEL_NEXT or
+ ROW_SEL_PREV; NOTE: if this is != 0,
+ then prebuilt must have a pcur
+ with stored position! In opening of a
+ cursor 'direction' should be 0. */
+/*******************************************************************//**
+Checks if MySQL at the moment is allowed for this table to retrieve a
+consistent read result, or store it to the query cache.
+@return TRUE if storing or retrieving from the query cache is permitted */
+UNIV_INTERN
+ibool
+row_search_check_if_query_cache_permitted(
+/*======================================*/
+ trx_t* trx, /*!< in: transaction object */
+ const char* norm_name); /*!< in: concatenation of database name,
+ '/' char, table name */
+/*******************************************************************//**
+Read the max AUTOINC value from an index.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+row_search_max_autoinc(
+/*===================*/
+ dict_index_t* index, /*!< in: index to search */
+ const char* col_name, /*!< in: autoinc column name */
+ ib_uint64_t* value); /*!< out: AUTOINC value read */
+
+/** A structure for caching column values for prefetched rows */
+struct sel_buf_struct{
+ byte* data; /*!< data, or NULL; if not NULL, this field
+ has allocated memory which must be explicitly
+ freed; can be != NULL even when len is
+ UNIV_SQL_NULL */
+ ulint len; /*!< data length or UNIV_SQL_NULL */
+ ulint val_buf_size;
+ /*!< size of memory buffer allocated for data:
+ this can be more than len; this is defined
+ when data != NULL */
+};
+
+/** Query plan */
+struct plan_struct{
+ dict_table_t* table; /*!< table struct in the dictionary
+ cache */
+ dict_index_t* index; /*!< table index used in the search */
+ btr_pcur_t pcur; /*!< persistent cursor used to search
+ the index */
+ ibool asc; /*!< TRUE if cursor traveling upwards */
+ ibool pcur_is_open; /*!< TRUE if pcur has been positioned
+ and we can try to fetch new rows */
+ ibool cursor_at_end; /*!< TRUE if the cursor is open but
+ we know that there are no more
+ qualifying rows left to retrieve from
+ the index tree; NOTE though, that
+ there may still be unprocessed rows in
+ the prefetch stack; always FALSE when
+ pcur_is_open is FALSE */
+ ibool stored_cursor_rec_processed;
+ /*!< TRUE if the pcur position has been
+ stored and the record it is positioned
+ on has already been processed */
+ que_node_t** tuple_exps; /*!< array of expressions
+ which are used to calculate
+ the field values in the search
+ tuple: there is one expression
+ for each field in the search
+ tuple */
+ dtuple_t* tuple; /*!< search tuple */
+ ulint mode; /*!< search mode: PAGE_CUR_G, ... */
+ ulint n_exact_match; /*!< number of first fields in
+ the search tuple which must be
+ exactly matched */
+ ibool unique_search; /*!< TRUE if we are searching an
+ index record with a unique key */
+ ulint n_rows_fetched; /*!< number of rows fetched using pcur
+ after it was opened */
+ ulint n_rows_prefetched;/*!< number of prefetched rows cached
+ for fetch: fetching several rows in
+ the same mtr saves CPU time */
+ ulint first_prefetched;/*!< index of the first cached row in
+ select buffer arrays for each column */
+ ibool no_prefetch; /*!< no prefetch for this table */
+ sym_node_list_t columns; /*!< symbol table nodes for the columns
+ to retrieve from the table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ end_conds; /*!< conditions which determine the
+ fetch limit of the index segment we
+ have to look at: when one of these
+ fails, the result set has been
+ exhausted for the cursor in this
+ index; these conditions are normalized
+ so that in a comparison the column
+ for this table is the first argument */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ other_conds; /*!< the rest of search conditions we can
+ test at this table in a join */
+ ibool must_get_clust; /*!< TRUE if index is a non-clustered
+ index and we must also fetch the
+ clustered index record; this is the
+ case if the non-clustered record does
+ not contain all the needed columns, or
+ if this is a single-table explicit
+ cursor, or a searched update or
+ delete */
+ ulint* clust_map; /*!< map telling how clust_ref is built
+ from the fields of a non-clustered
+ record */
+ dtuple_t* clust_ref; /*!< the reference to the clustered
+ index entry is built here if index is
+ a non-clustered index */
+ btr_pcur_t clust_pcur; /*!< if index is non-clustered, we use
+ this pcur to search the clustered
+ index */
+ mem_heap_t* old_vers_heap; /*!< memory heap used in building an old
+ version of a row, or NULL */
+};
+
+/** Select node states */
+enum sel_node_state {
+ SEL_NODE_CLOSED, /*!< it is a declared cursor which is not
+ currently open */
+ SEL_NODE_OPEN, /*!< intention locks not yet set on tables */
+ SEL_NODE_FETCH, /*!< intention locks have been set */
+ SEL_NODE_NO_MORE_ROWS /*!< cursor has reached the result set end */
+};
+
+/** Select statement node */
+struct sel_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_SELECT */
+ enum sel_node_state
+ state; /*!< node state */
+ que_node_t* select_list; /*!< select list */
+ sym_node_t* into_list; /*!< variables list or NULL */
+ sym_node_t* table_list; /*!< table list */
+ ibool asc; /*!< TRUE if the rows should be fetched
+ in an ascending order */
+ ibool set_x_locks; /*!< TRUE if the cursor is for update or
+ delete, which means that a row x-lock
+ should be placed on the cursor row */
+ ulint row_lock_mode; /*!< LOCK_X or LOCK_S */
+ ulint n_tables; /*!< number of tables */
+ ulint fetch_table; /*!< number of the next table to access
+ in the join */
+ plan_t* plans; /*!< array of n_tables many plan nodes
+ containing the search plan and the
+ search data structures */
+ que_node_t* search_cond; /*!< search condition */
+ read_view_t* read_view; /*!< if the query is a non-locking
+ consistent read, its read view is
+ placed here, otherwise NULL */
+ ibool consistent_read;/*!< TRUE if the select is a consistent,
+ non-locking read */
+ order_node_t* order_by; /*!< order by column definition, or
+ NULL */
+ ibool is_aggregate; /*!< TRUE if the select list consists of
+ aggregate functions */
+ ibool aggregate_already_fetched;
+ /*!< TRUE if the aggregate row has
+ already been fetched for the current
+ cursor */
+ ibool can_get_updated;/*!< this is TRUE if the select
+ is in a single-table explicit
+ cursor which can get updated
+ within the stored procedure,
+ or in a searched update or
+ delete; NOTE that to determine
+ of an explicit cursor if it
+ can get updated, the parser
+ checks from a stored procedure
+ if it contains positioned
+ update or delete statements */
+ sym_node_t* explicit_cursor;/*!< not NULL if an explicit cursor */
+ UT_LIST_BASE_NODE_T(sym_node_t)
+ copy_variables; /*!< variables whose values we have to
+ copy when an explicit cursor is opened,
+ so that they do not change between
+ fetches */
+};
+
+/** Fetch statement node */
+struct fetch_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_FETCH */
+ sel_node_t* cursor_def; /*!< cursor definition */
+ sym_node_t* into_list; /*!< variables to set */
+
+ pars_user_func_t*
+ func; /*!< User callback function or NULL.
+ The first argument to the function
+ is a sel_node_t*, containing the
+ results of the SELECT operation for
+ one row. If the function returns
+ NULL, it is not interested in
+ further rows and the cursor is
+ modified so (cursor % NOTFOUND) is
+ true. If it returns not-NULL,
+ continue normally. See
+ row_fetch_print() for an example
+ (and a useful debugging tool). */
+};
+
+/** Open or close cursor operation type */
+enum open_node_op {
+ ROW_SEL_OPEN_CURSOR, /*!< open cursor */
+ ROW_SEL_CLOSE_CURSOR /*!< close cursor */
+};
+
+/** Open or close cursor statement node */
+struct open_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_OPEN */
+ enum open_node_op
+ op_type; /*!< operation type: open or
+ close cursor */
+ sel_node_t* cursor_def; /*!< cursor definition */
+};
+
+/** Row printf statement node */
+struct row_printf_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */
+ sel_node_t* sel_node; /*!< select */
+};
+
+/** Search direction for the MySQL interface */
+enum row_sel_direction {
+ ROW_SEL_NEXT = 1, /*!< ascending direction */
+ ROW_SEL_PREV = 2 /*!< descending direction */
+};
+
+/** Match mode for the MySQL interface */
+enum row_sel_match_mode {
+ ROW_SEL_EXACT = 1, /*!< search using a complete key value */
+ ROW_SEL_EXACT_PREFIX /*!< search using a key prefix which
+ must match rows: the prefix may
+ contain an incomplete field (the last
+ field in prefix may be just a prefix
+ of a fixed length column) */
+};
+
+#ifndef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0sel.ic b/storage/innobase/include/row0sel.ic
new file mode 100644
index 00000000000..5907f9913da
--- /dev/null
+++ b/storage/innobase/include/row0sel.ic
@@ -0,0 +1,105 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.ic
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+
+/*********************************************************************//**
+Gets the plan node for the nth table in a join.
+@return plan node */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+ sel_node_t* node, /*!< in: select node */
+ ulint i) /*!< in: get ith plan node */
+{
+ ut_ad(i < node->n_tables);
+
+ return(node->plans + i);
+}
+
+/*********************************************************************//**
+Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means
+that it will start fetching from the start of the result set again, regardless
+of where it was before, and it will set intention locks on the tables. */
+UNIV_INLINE
+void
+sel_node_reset_cursor(
+/*==================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ node->state = SEL_NODE_OPEN;
+}
+
+/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* sel_node;
+ open_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = (open_node_t*) thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_OPEN);
+
+ sel_node = node->cursor_def;
+
+ err = DB_SUCCESS;
+
+ if (node->op_type == ROW_SEL_OPEN_CURSOR) {
+
+ /* if (sel_node->state == SEL_NODE_CLOSED) { */
+
+ sel_node_reset_cursor(sel_node);
+ /* } else {
+ err = DB_ERROR;
+ } */
+ } else {
+ if (sel_node->state != SEL_NODE_CLOSED) {
+
+ sel_node->state = SEL_NODE_CLOSED;
+ } else {
+ err = DB_ERROR;
+ }
+ }
+
+ if (UNIV_EXPECT(err, DB_SUCCESS) != DB_SUCCESS) {
+ /* SQL error detected */
+ fprintf(stderr, "SQL error %lu\n", (ulong) err);
+
+ ut_error;
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h
new file mode 100644
index 00000000000..7920fd75061
--- /dev/null
+++ b/storage/innobase/include/row0types.h
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0types.h
+Row operation global types
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0types_h
+#define row0types_h
+
+typedef struct plan_struct plan_t;
+
+typedef struct upd_struct upd_t;
+
+typedef struct upd_field_struct upd_field_t;
+
+typedef struct upd_node_struct upd_node_t;
+
+typedef struct del_node_struct del_node_t;
+
+typedef struct ins_node_struct ins_node_t;
+
+typedef struct sel_node_struct sel_node_t;
+
+typedef struct open_node_struct open_node_t;
+
+typedef struct fetch_node_struct fetch_node_t;
+
+typedef struct row_printf_node_struct row_printf_node_t;
+typedef struct sel_buf_struct sel_buf_t;
+
+typedef struct undo_node_struct undo_node_t;
+
+typedef struct purge_node_struct purge_node_t;
+
+typedef struct row_ext_struct row_ext_t;
+
+/* MySQL data types */
+typedef struct st_table TABLE;
+
+#endif
diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h
new file mode 100644
index 00000000000..77b071c3a6b
--- /dev/null
+++ b/storage/innobase/include/row0uins.h
@@ -0,0 +1,54 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.h
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0uins_h
+#define row0uins_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert. InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_undo_ins(
+/*=========*/
+ undo_node_t* node); /*!< in: row undo node */
+
+#ifndef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0uins.ic b/storage/innobase/include/row0uins.ic
new file mode 100644
index 00000000000..27606150d8e
--- /dev/null
+++ b/storage/innobase/include/row0uins.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.ic
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h
new file mode 100644
index 00000000000..ed44cc8d601
--- /dev/null
+++ b/storage/innobase/include/row0umod.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.h
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0umod_h
+#define row0umod_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_undo_mod(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr); /*!< in: query thread */
+
+
+#ifndef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0umod.ic b/storage/innobase/include/row0umod.ic
new file mode 100644
index 00000000000..ea3fd3b43c7
--- /dev/null
+++ b/storage/innobase/include/row0umod.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.ic
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h
new file mode 100644
index 00000000000..6eb4ca448b3
--- /dev/null
+++ b/storage/innobase/include/row0undo.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.h
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0undo_h
+#define row0undo_h
+
+#include "univ.i"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a row undo node to a query graph.
+@return own: undo node */
+UNIV_INTERN
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return TRUE if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+UNIV_INTERN
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ undo_node_t* node); /*!< in: row undo node */
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* A single query thread will try to perform the undo for all successive
+versions of a clustered index record, if the transaction has modified it
+several times during the execution which is rolled back. It may happen
+that the task is transferred to another query thread, if the other thread
+is assigned to handle an undo log record in the chain of different versions
+of the record, and the other thread happens to get the x-latch to the
+clustered index record at the right time.
+ If a query thread notices that the clustered index record it is looking
+for is missing, or the roll ptr field in the record doed not point to the
+undo log record the thread was assigned to handle, then it gives up the undo
+task for that undo log record, and fetches the next. This situation can occur
+just in the case where the transaction modified the same record several times
+and another thread is currently doing the undo for successive versions of
+that index record. */
+
+/** Execution state of an undo node */
+enum undo_exec {
+ UNDO_NODE_FETCH_NEXT = 1, /*!< we should fetch the next
+ undo log record */
+ UNDO_NODE_PREV_VERS, /*!< the roll ptr to previous
+ version of a row is stored in
+ node, and undo should be done
+ based on it */
+ UNDO_NODE_INSERT, /*!< undo a fresh insert of a
+ row to a table */
+ UNDO_NODE_MODIFY /*!< undo a modify operation
+ (DELETE or UPDATE) on a row
+ of a table */
+};
+
+/** Undo node structure */
+struct undo_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_UNDO */
+ enum undo_exec state; /*!< node execution state */
+ trx_t* trx; /*!< trx for which undo is done */
+ roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */
+ trx_undo_rec_t* undo_rec;/*!< undo log record */
+ undo_no_t undo_no;/*!< undo number of the record */
+ ulint rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ roll_ptr_t new_roll_ptr;
+ /*!< roll ptr to restore to clustered index
+ record */
+ trx_id_t new_trx_id; /*!< trx id to restore to clustered index
+ record */
+ btr_pcur_t pcur; /*!< persistent cursor used in searching the
+ clustered index record */
+ dict_table_t* table; /*!< table where undo is done */
+ ulint cmpl_info;/*!< compiler analysis of an update */
+ upd_t* update; /*!< update vector for a clustered index
+ record */
+ dtuple_t* ref; /*!< row reference to the next row to handle */
+ dtuple_t* row; /*!< a copy (also fields copied to heap) of the
+ row to handle */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns of the row */
+ dtuple_t* undo_row;/*!< NULL, or the row after undo */
+ row_ext_t* undo_ext;/*!< NULL, or prefixes of the externally
+ stored columns of undo_row */
+ dict_index_t* index; /*!< the next index whose record should be
+ handled */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage for
+ row; this must be emptied after undo is tried
+ on a row */
+};
+
+
+#ifndef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0undo.ic b/storage/innobase/include/row0undo.ic
new file mode 100644
index 00000000000..dc788debc14
--- /dev/null
+++ b/storage/innobase/include/row0undo.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.ic
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h
new file mode 100644
index 00000000000..635d746d5a1
--- /dev/null
+++ b/storage/innobase/include/row0upd.h
@@ -0,0 +1,483 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.h
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0upd_h
+#define row0upd_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "btr0pcur.h"
+# include "que0types.h"
+# include "pars0types.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ ulint n, /*!< in: number of fields */
+ mem_heap_t* heap); /*!< in: heap from which memory allocated */
+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ const upd_t* update); /*!< in: update vector */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ const upd_t* update, /*!< in: update vector */
+ ulint n); /*!< in: field position in update vector */
+#else
+# define upd_get_nth_field(update, n) ((update)->fields + (n))
+#endif
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /*!< in: update vector field */
+ ulint field_no, /*!< in: field number in a clustered
+ index */
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx); /*!< in: transaction */
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+ const upd_t* update, /*!< in: update vector */
+ ulint no) /*!< in: field_no */
+ __attribute__((nonnull, pure));
+/*********************************************************************//**
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record.
+@return new pointer to mlog */
+UNIV_INTERN
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+ dict_index_t* index, /*!< in: clustered index */
+ trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */
+ byte* log_ptr,/*!< pointer to a buffer of size > 20 opened
+ in mlog */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record when
+a row is updated or marked deleted. */
+UNIV_INLINE
+void
+row_upd_rec_sys_fields(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record */
+/*********************************************************************//**
+Sets the trx id or roll ptr field of a clustered index entry. */
+UNIV_INTERN
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+ const dtuple_t* entry, /*!< in: index entry, where the memory buffers
+ for sys fields are already allocated:
+ the function just copies the new values to
+ them */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */
+ dulint val); /*!< in: value to write */
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+upd_node_create(
+/*============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Writes to the redo log the new values of the fields occurring in the index. */
+UNIV_INTERN
+void
+row_upd_index_write_log(
+/*====================*/
+ const upd_t* update, /*!< in: update vector */
+ byte* log_ptr,/*!< in: pointer to mlog buffer: must
+ contain at least MLOG_BUF_MARGIN bytes
+ of free space; the buffer is closed
+ within this function */
+ mtr_t* mtr); /*!< in: mtr into whose log to write */
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+UNIV_INTERN
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update);/*!< in: update vector */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the record
+given. No field size changes are allowed. */
+UNIV_INTERN
+void
+row_upd_rec_in_place(
+/*=================*/
+ rec_t* rec, /*!< in/out: record where replaced */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ page_zip_des_t* page_zip);/*!< in: compressed page with enough space
+ available, or NULL */
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+UNIV_INTERN
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ const rec_t* rec, /*!< in: secondary index record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap); /*!< in: memory heap from which allocated */
+/***************************************************************//**
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+UNIV_INTERN
+upd_t*
+row_upd_build_difference_binary(
+/*============================*/
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ const rec_t* rec, /*!< in: clustered index record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap); /*!< in: memory heap from which allocated */
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the index entry
+given. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals_index_pos(
+/*=========================================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the index so
+ that the field number in an upd_field is the
+ index position */
+ ibool order_only,
+ /*!< in: if TRUE, limit the replacement to
+ ordering fields of index; note that this
+ does not work for non-clustered indexes. */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+ __attribute__((nonnull));
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the index entry
+given. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ CLUSTERED index so that the field number in
+ an upd_field is the clustered index position */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+ __attribute__((nonnull));
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+UNIV_INTERN
+void
+row_upd_replace(
+/*============*/
+ dtuple_t* row, /*!< in/out: row where replaced,
+ indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ row_ext_t** ext, /*!< out, own: NULL, or externally
+ stored column prefixes */
+ const dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ clustered index */
+ mem_heap_t* heap); /*!< in: memory heap */
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+UNIV_INTERN
+ibool
+row_upd_changes_ord_field_binary(
+/*=============================*/
+ const dtuple_t* row, /*!< in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update);/*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+UNIV_INTERN
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ const dict_table_t* table, /*!< in: table */
+ const upd_t* update);/*!< in: update vector for the row */
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_upd_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Parses the log data of system field values.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint* pos, /*!< out: TRX_ID position in record */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr);/*!< out: roll ptr */
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record in database
+recovery. */
+UNIV_INTERN
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint pos, /*!< in: TRX_ID position in rec */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record */
+/*********************************************************************//**
+Parses the log data written by row_upd_index_write_log.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_index_parse(
+/*================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ mem_heap_t* heap, /*!< in: memory heap where update vector is
+ built */
+ upd_t** update_out);/*!< out: update vector */
+
+
+/* Update vector field */
+struct upd_field_struct{
+ unsigned field_no:16; /*!< field number in an index, usually
+ the clustered index, but in updating
+ a secondary index record in btr0cur.c
+ this is the position in the secondary
+ index */
+#ifndef UNIV_HOTBACKUP
+ unsigned orig_len:16; /*!< original length of the locally
+ stored part of an externally stored
+ column, or 0 */
+ que_node_t* exp; /*!< expression for calculating a new
+ value: it refers to column values and
+ constants in the symbol table of the
+ query graph */
+#endif /* !UNIV_HOTBACKUP */
+ dfield_t new_val; /*!< new value for the column */
+};
+
+/* Update vector structure */
+struct upd_struct{
+ ulint info_bits; /*!< new value of info bits to record;
+ default is 0 */
+ ulint n_fields; /*!< number of update fields */
+ upd_field_t* fields; /*!< array of update fields */
+};
+
+#ifndef UNIV_HOTBACKUP
+/* Update node structure which also implements the delete operation
+of a row */
+
+struct upd_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_UPDATE */
+ ibool is_delete;/* TRUE if delete, FALSE if update */
+ ibool searched_update;
+ /* TRUE if searched update, FALSE if
+ positioned */
+ ibool in_mysql_interface;
+ /* TRUE if the update node was created
+ for the MySQL interface */
+ dict_foreign_t* foreign;/* NULL or pointer to a foreign key
+ constraint if this update node is used in
+ doing an ON DELETE or ON UPDATE operation */
+ upd_node_t* cascade_node;/* NULL or an update node template which
+ is used to implement ON DELETE/UPDATE CASCADE
+ or ... SET NULL for foreign keys */
+ mem_heap_t* cascade_heap;/* NULL or a mem heap where the cascade
+ node is created */
+ sel_node_t* select; /*!< query graph subtree implementing a base
+ table cursor: the rows returned will be
+ updated */
+ btr_pcur_t* pcur; /*!< persistent cursor placed on the clustered
+ index record which should be updated or
+ deleted; the cursor is stored in the graph
+ of 'select' field above, except in the case
+ of the MySQL interface */
+ dict_table_t* table; /*!< table where updated */
+ upd_t* update; /*!< update vector for the row */
+ ulint update_n_fields;
+ /* when this struct is used to implement
+ a cascade operation for foreign keys, we store
+ here the size of the buffer allocated for use
+ as the update vector */
+ sym_node_list_t columns;/* symbol table nodes for the columns
+ to retrieve from the table */
+ ibool has_clust_rec_x_lock;
+ /* TRUE if the select which retrieves the
+ records to update already sets an x-lock on
+ the clustered record; note that it must always
+ set at least an s-lock */
+ ulint cmpl_info;/* information extracted during query
+ compilation; speeds up execution:
+ UPD_NODE_NO_ORD_CHANGE and
+ UPD_NODE_NO_SIZE_CHANGE, ORed */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ dict_index_t* index; /*!< NULL, or the next index whose record should
+ be updated */
+ dtuple_t* row; /*!< NULL, or a copy (also fields copied to
+ heap) of the row to update; this must be reset
+ to NULL after a successful update */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+ dtuple_t* upd_row;/* NULL, or a copy of the updated row */
+ row_ext_t* upd_ext;/* NULL, or prefixes of the externally
+ stored columns in upd_row */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage;
+ this must be emptied after a successful
+ update */
+ /*----------------------*/
+ sym_node_t* table_sym;/* table node in symbol table */
+ que_node_t* col_assign_list;
+ /* column assignment list */
+ ulint magic_n;
+};
+
+#define UPD_NODE_MAGIC_N 1579975
+
+/* Node execution states */
+#define UPD_NODE_SET_IX_LOCK 1 /* execution came to the node from
+ a node above and if the field
+ has_clust_rec_x_lock is FALSE, we
+ should set an intention x-lock on
+ the table */
+#define UPD_NODE_UPDATE_CLUSTERED 2 /* clustered index record should be
+ updated */
+#define UPD_NODE_INSERT_CLUSTERED 3 /* clustered index record should be
+ inserted, old record is already delete
+ marked */
+#define UPD_NODE_UPDATE_ALL_SEC 4 /* an ordering field of the clustered
+ index record was changed, or this is
+ a delete operation: should update
+ all the secondary index records */
+#define UPD_NODE_UPDATE_SOME_SEC 5 /* secondary index entries should be
+ looked at and updated if an ordering
+ field changed */
+
+/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
+#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be
+ changed in the update and no ordering
+ field of the clustered index */
+#define UPD_NODE_NO_SIZE_CHANGE 2 /* no record field size will be
+ changed in the update */
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic
new file mode 100644
index 00000000000..18e22f1eca9
--- /dev/null
+++ b/storage/innobase/include/row0upd.ic
@@ -0,0 +1,184 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.ic
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#ifndef UNIV_HOTBACKUP
+# include "trx0trx.h"
+# include "trx0undo.h"
+# include "row0row.h"
+# include "btr0sea.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "page0zip.h"
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ ulint n, /*!< in: number of fields */
+ mem_heap_t* heap) /*!< in: heap from which memory allocated */
+{
+ upd_t* update;
+
+ update = (upd_t*) mem_heap_alloc(heap, sizeof(upd_t));
+
+ update->info_bits = 0;
+ update->n_fields = n;
+ update->fields = (upd_field_t*)
+ mem_heap_alloc(heap, sizeof(upd_field_t) * n);
+
+ return(update);
+}
+
+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ const upd_t* update) /*!< in: update vector */
+{
+ ut_ad(update);
+
+ return(update->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ const upd_t* update, /*!< in: update vector */
+ ulint n) /*!< in: field position in update vector */
+{
+ ut_ad(update);
+ ut_ad(n < update->n_fields);
+
+ return((upd_field_t*) update->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /*!< in: update vector field */
+ ulint field_no, /*!< in: field number in a clustered
+ index */
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ upd_field->field_no = field_no;
+ upd_field->orig_len = 0;
+
+ if (UNIV_UNLIKELY(field_no >= dict_index_get_n_fields(index))) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to access field %lu in ",
+ (ulong) field_no);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, "\n"
+ "InnoDB: but index only has %lu fields\n",
+ (ulong) dict_index_get_n_fields(index));
+ }
+
+ dict_col_copy_type(dict_index_get_nth_col(index, field_no),
+ dfield_get_type(&upd_field->new_val));
+}
+
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+ const upd_t* update, /*!< in: update vector */
+ ulint no) /*!< in: field_no */
+{
+ ulint i;
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+
+ if (uf->field_no == no) {
+
+ return(uf);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record when
+a row is updated or marked deleted. */
+UNIV_INLINE
+void
+row_upd_rec_sys_fields(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+#ifdef UNIV_SYNC_DEBUG
+ if (!rw_lock_own(&btr_search_latch, RW_LOCK_EX)) {
+ ut_ad(!buf_block_align(rec)->is_hashed);
+ }
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ ulint pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+ page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets,
+ pos, trx->id, roll_ptr);
+ } else {
+ ulint offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(rec, index, offsets);
+ }
+
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+ trx_write_trx_id(rec + offset, trx->id);
+ trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
new file mode 100644
index 00000000000..5a2e38230d5
--- /dev/null
+++ b/storage/innobase/include/row0vers.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.h
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0vers_h
+#define row0vers_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "rem0types.h"
+#include "mtr0mtr.h"
+#include "read0types.h"
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function!
+@return NULL if committed, else the active transaction */
+UNIV_INTERN
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: the secondary index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*****************************************************************//**
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view.
+@return TRUE if earlier version should be preserved */
+UNIV_INTERN
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ trx_id_t trx_id, /*!< in: transaction id in the version */
+ mtr_t* mtr); /*!< in: mtr holding the latch on the
+ clustered index record; it will also
+ hold the latch on purge_view */
+/*****************************************************************//**
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE.
+@return TRUE if earlier version should have */
+UNIV_INTERN
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+ ibool also_curr,/*!< in: TRUE if also rec is included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ const rec_t* rec, /*!< in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the secondary index */
+ const dtuple_t* ientry);/*!< in: the secondary index entry */
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this records */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ read_view_t* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers);/*!< out, own: old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this records */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers);/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+
+
+#ifndef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0vers.ic b/storage/innobase/include/row0vers.ic
new file mode 100644
index 00000000000..8bb3a5c0cb3
--- /dev/null
+++ b/storage/innobase/include/row0vers.ic
@@ -0,0 +1,30 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.ic
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+#include "dict0dict.h"
+#include "read0read.h"
+#include "page0page.h"
+#include "log0recv.h"
diff --git a/storage/innobase/include/srv0que.h b/storage/innobase/include/srv0que.h
new file mode 100644
index 00000000000..82ee7739ef7
--- /dev/null
+++ b/storage/innobase/include/srv0que.h
@@ -0,0 +1,42 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0que.h
+Server query execution
+
+Created 6/5/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0que_h
+#define srv0que_h
+
+#include "univ.i"
+#include "que0types.h"
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+ que_thr_t* thr); /*!< in: query thread */
+
+#endif
+
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
new file mode 100644
index 00000000000..499bccfe2b8
--- /dev/null
+++ b/storage/innobase/include/srv0srv.h
@@ -0,0 +1,664 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, 2009, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+/***********************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.h
+The server main program
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0srv_h
+#define srv0srv_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "sync0sync.h"
+#include "os0sync.h"
+#include "que0types.h"
+#include "trx0types.h"
+
+extern const char* srv_main_thread_op_info;
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char srv_mysql50_table_name_prefix[9];
+
+/* When this event is set the lock timeout and InnoDB monitor
+thread starts running */
+extern os_event_t srv_lock_timeout_thread_event;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT \
+ (srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE))
+
+/* This is set to TRUE if the MySQL user has set it in MySQL */
+extern ibool srv_lower_case_table_names;
+
+/* Mutex for locking srv_monitor_file */
+extern mutex_t srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE* srv_monitor_file;
+/* Mutex for locking srv_dict_tmpfile.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+extern mutex_t srv_dict_tmpfile_mutex;
+/* Temporary file for output from the data dictionary */
+extern FILE* srv_dict_tmpfile;
+/* Mutex for locking srv_misc_tmpfile.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+extern mutex_t srv_misc_tmpfile_mutex;
+/* Temporary file for miscellanous diagnostic output */
+extern FILE* srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char* srv_data_home;
+#ifdef UNIV_LOG_ARCHIVE
+extern char* srv_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** store to its own file each table created by an user; data
+dictionary tables are in the system tablespace 0 */
+#ifndef UNIV_HOTBACKUP
+extern my_bool srv_file_per_table;
+#else
+extern ibool srv_file_per_table;
+#endif /* UNIV_HOTBACKUP */
+/** The file format to use on new *.ibd files. */
+extern ulint srv_file_format;
+/** Whether to check file format during startup. A value of
+DICT_TF_FORMAT_MAX + 1 means no checking ie. FALSE. The default is to
+set it to the highest format we support. */
+extern ulint srv_check_file_format_at_startup;
+/** Place locks to records only i.e. do not use next-key locking except
+on duplicate key checking and foreign key checking */
+extern ibool srv_locks_unsafe_for_binlog;
+#endif /* !UNIV_HOTBACKUP */
+
+extern ulint srv_n_data_files;
+extern char** srv_data_file_names;
+extern ulint* srv_data_file_sizes;
+extern ulint* srv_data_file_is_raw_partition;
+
+extern ibool srv_auto_extend_last_data_file;
+extern ulint srv_last_file_size_max;
+extern char** srv_log_group_home_dirs;
+#ifndef UNIV_HOTBACKUP
+extern ulong srv_auto_extend_increment;
+
+extern ibool srv_created_new_raw;
+
+extern ulint srv_n_log_groups;
+extern ulint srv_n_log_files;
+extern ulint srv_log_file_size;
+extern ulint srv_log_buffer_size;
+extern ulong srv_flush_log_at_trx_commit;
+extern char srv_adaptive_flushing;
+
+
+/* The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+extern const byte* srv_latin1_ordering;
+#ifndef UNIV_HOTBACKUP
+extern my_bool srv_use_sys_malloc;
+#else
+extern ibool srv_use_sys_malloc;
+#endif /* UNIV_HOTBACKUP */
+extern ulint srv_buf_pool_size; /*!< requested size in bytes */
+extern ulint srv_buf_pool_old_size; /*!< previously requested size */
+extern ulint srv_buf_pool_curr_size; /*!< current size in bytes */
+extern ulint srv_mem_pool_size;
+extern ulint srv_lock_table_size;
+
+extern ulint srv_n_file_io_threads;
+extern ulong srv_read_ahead_threshold;
+extern ulint srv_n_read_io_threads;
+extern ulint srv_n_write_io_threads;
+
+/* Number of IO operations per second the server can do */
+extern ulong srv_io_capacity;
+/* Returns the number of IO operations that is X percent of the
+capacity. PCT_IO(5) -> returns the number of IO operations that
+is 5% of the max where max is srv_io_capacity. */
+#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) p / 100.0)))
+
+#ifdef UNIV_LOG_ARCHIVE
+extern ibool srv_log_archive_on;
+extern ibool srv_archive_recovery;
+extern dulint srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+extern char* srv_file_flush_method_str;
+extern ulint srv_unix_file_flush_method;
+extern ulint srv_win_file_flush_method;
+
+extern ulint srv_max_n_open_files;
+
+extern ulint srv_max_dirty_pages_pct;
+
+extern ulint srv_force_recovery;
+extern ulong srv_thread_concurrency;
+
+extern ulint srv_max_n_threads;
+
+extern lint srv_conc_n_threads;
+
+extern ulint srv_fast_shutdown; /* If this is 1, do not do a
+ purge and index buffer merge.
+ If this 2, do not even flush the
+ buffer pool to data files at the
+ shutdown: we effectively 'crash'
+ InnoDB (but lose no committed
+ transactions). */
+extern ibool srv_innodb_status;
+
+extern unsigned long long srv_stats_sample_pages;
+
+extern ibool srv_use_doublewrite_buf;
+extern ibool srv_use_checksums;
+
+extern ibool srv_set_thread_priorities;
+extern int srv_query_thread_priority;
+
+extern ulong srv_max_buf_pool_modified_pct;
+extern ulong srv_max_purge_lag;
+
+extern ulong srv_replication_delay;
+/*-------------------------------------------*/
+
+extern ulint srv_n_rows_inserted;
+extern ulint srv_n_rows_updated;
+extern ulint srv_n_rows_deleted;
+extern ulint srv_n_rows_read;
+
+extern ibool srv_print_innodb_monitor;
+extern ibool srv_print_innodb_lock_monitor;
+extern ibool srv_print_innodb_tablespace_monitor;
+extern ibool srv_print_verbose_log;
+extern ibool srv_print_innodb_table_monitor;
+
+extern ibool srv_lock_timeout_and_monitor_active;
+extern ibool srv_error_monitor_active;
+
+extern ulong srv_n_spin_wait_rounds;
+extern ulong srv_n_free_tickets_to_enter;
+extern ulong srv_thread_sleep_delay;
+extern ulong srv_spin_wait_delay;
+extern ibool srv_priority_boost;
+
+extern ulint srv_mem_pool_size;
+extern ulint srv_lock_table_size;
+
+#ifdef UNIV_DEBUG
+extern ibool srv_print_thread_releases;
+extern ibool srv_print_lock_waits;
+extern ibool srv_print_buf_io;
+extern ibool srv_print_log_io;
+extern ibool srv_print_latch_waits;
+#else /* UNIV_DEBUG */
+# define srv_print_thread_releases FALSE
+# define srv_print_lock_waits FALSE
+# define srv_print_buf_io FALSE
+# define srv_print_log_io FALSE
+# define srv_print_latch_waits FALSE
+#endif /* UNIV_DEBUG */
+
+extern ulint srv_activity_count;
+extern ulint srv_fatal_semaphore_wait_threshold;
+extern ulint srv_dml_needed_delay;
+
+extern mutex_t* kernel_mutex_temp;/* mutex protecting the server, trx structs,
+ query threads, and lock table: we allocate
+ it from dynamic memory to get it to the
+ same DRAM page as other hotspot semaphores */
+#define kernel_mutex (*kernel_mutex_temp)
+
+#define SRV_MAX_N_IO_THREADS 130
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+extern const char* srv_io_thread_op_info[];
+extern const char* srv_io_thread_function[];
+
+/* the number of the log write requests done */
+extern ulint srv_log_write_requests;
+
+/* the number of physical writes to the log performed */
+extern ulint srv_log_writes;
+
+/* amount of data written to the log files in bytes */
+extern ulint srv_os_log_written;
+
+/* amount of writes being done to the log files */
+extern ulint srv_os_log_pending_writes;
+
+/* we increase this counter, when there we don't have enough space in the
+log buffer and have to flush it */
+extern ulint srv_log_waits;
+
+/* variable that counts amount of data read in total (in bytes) */
+extern ulint srv_data_read;
+
+/* here we count the amount of data written in total (in bytes) */
+extern ulint srv_data_written;
+
+/* this variable counts the amount of times, when the doublewrite buffer
+was flushed */
+extern ulint srv_dblwr_writes;
+
+/* here we store the number of pages that have been flushed to the
+doublewrite buffer */
+extern ulint srv_dblwr_pages_written;
+
+/* in this variable we store the number of write requests issued */
+extern ulint srv_buf_pool_write_requests;
+
+/* here we store the number of times when we had to wait for a free page
+in the buffer pool. It happens when the buffer pool is full and we need
+to make a flush, in order to be able to read or create a page. */
+extern ulint srv_buf_pool_wait_free;
+
+/* variable to count the number of pages that were written from the
+buffer pool to disk */
+extern ulint srv_buf_pool_flushed;
+
+/** Number of buffer pool reads that led to the
+reading of a disk page */
+extern ulint srv_buf_pool_reads;
+/** Number of sequential read-aheads */
+extern ulint srv_read_ahead_seq;
+/** Number of random read-aheads */
+extern ulint srv_read_ahead_rnd;
+
+/** Status variables to be passed to MySQL */
+typedef struct export_var_struct export_struc;
+
+/** Status variables to be passed to MySQL */
+extern export_struc export_vars;
+
+/** The server system */
+typedef struct srv_sys_struct srv_sys_t;
+
+/** The server system */
+extern srv_sys_t* srv_sys;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Types of raw partitions in innodb_data_file_path */
+enum {
+ SRV_NOT_RAW = 0, /*!< Not a raw partition */
+ SRV_NEW_RAW, /*!< A 'newraw' partition, only to be
+ initialized */
+ SRV_OLD_RAW /*!< An initialized raw partition */
+};
+
+/** Alternatives for the file flush option in Unix; see the InnoDB manual
+about what these mean */
+enum {
+ SRV_UNIX_FSYNC = 1, /*!< fsync, the default */
+ SRV_UNIX_O_DSYNC, /*!< open log files in O_SYNC mode */
+ SRV_UNIX_LITTLESYNC, /*!< do not call os_file_flush()
+ when writing data files, but do flush
+ after writing to log files */
+ SRV_UNIX_NOSYNC, /*!< do not flush after writing */
+ SRV_UNIX_O_DIRECT /*!< invoke os_file_set_nocache() on
+ data files */
+};
+
+/** Alternatives for file i/o in Windows */
+enum {
+ SRV_WIN_IO_NORMAL = 1, /*!< buffered I/O */
+ SRV_WIN_IO_UNBUFFERED /*!< unbuffered I/O; this is the default */
+};
+
+/** Alternatives for srv_force_recovery. Non-zero values are intended
+to help the user get a damaged database up so that he can dump intact
+tables and rows with SELECT INTO OUTFILE. The database must not otherwise
+be used with these options! A bigger number below means that all precautions
+of lower numbers are included. */
+enum {
+ SRV_FORCE_IGNORE_CORRUPT = 1, /*!< let the server run even if it
+ detects a corrupt page */
+ SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from
+ running: if a crash would occur
+ in purge, this prevents it */
+ SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run trx rollback after
+ recovery */
+ SRV_FORCE_NO_IBUF_MERGE = 4, /*!< prevent also ibuf operations:
+ if they would cause a crash, better
+ not do them */
+ SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when
+ starting the database: InnoDB will
+ treat even incomplete transactions
+ as committed */
+ SRV_FORCE_NO_LOG_REDO = 6 /*!< do not do the log roll-forward
+ in connection with recovery */
+};
+
+#ifndef UNIV_HOTBACKUP
+/** Types of threads existing in the system. */
+enum srv_thread_type {
+ SRV_COM = 1, /**< threads serving communication and queries */
+ SRV_CONSOLE, /**< thread serving console */
+ SRV_WORKER, /**< threads serving parallelized queries and
+ queries released from lock wait */
+#if 0
+ /* Utility threads */
+ SRV_BUFFER, /**< thread flushing dirty buffer blocks */
+ SRV_RECOVERY, /**< threads finishing a recovery */
+ SRV_INSERT, /**< thread flushing the insert buffer to disk */
+#endif
+ SRV_MASTER /**< the master thread, (whose type number must
+ be biggest) */
+};
+
+/*********************************************************************//**
+Boots Innobase server.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+srv_boot(void);
+/*==========*/
+/*********************************************************************//**
+Initializes the server. */
+UNIV_INTERN
+void
+srv_init(void);
+/*==========*/
+/*********************************************************************//**
+Frees the OS fast mutex created in srv_boot(). */
+UNIV_INTERN
+void
+srv_free(void);
+/*==========*/
+/*********************************************************************//**
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+UNIV_INTERN
+void
+srv_general_init(void);
+/*==================*/
+/*********************************************************************//**
+Gets the number of threads in the system.
+@return sum of srv_n_threads[] */
+UNIV_INTERN
+ulint
+srv_get_n_threads(void);
+/*===================*/
+/*********************************************************************//**
+Returns the calling thread type.
+@return SRV_COM, ... */
+
+enum srv_thread_type
+srv_get_thread_type(void);
+/*=====================*/
+/*********************************************************************//**
+Sets the info describing an i/o thread current state. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+ ulint i, /*!< in: the 'segment' of the i/o thread */
+ const char* str); /*!< in: constant char string describing the
+ state */
+/*********************************************************************//**
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+enough threads were suspended at the moment */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+ enum srv_thread_type type, /*!< in: thread type */
+ ulint n); /*!< in: number of threads to release */
+/*********************************************************************//**
+The master thread controlling the server.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_master_thread(
+/*==============*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/*******************************************************************//**
+Tells the Innobase server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the kernel
+mutex, for performace reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void);
+/*===============================*/
+/*******************************************************************//**
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void);
+/*========================*/
+/*********************************************************************//**
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+ trx_t* trx); /*!< in: transaction object associated with the
+ thread */
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+ trx_t* trx); /*!< in: transaction object associated with the
+ thread */
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+ trx_t* trx); /*!< in: transaction object associated with the
+ thread */
+/*********************************************************************//**
+This must be called when a thread exits InnoDB. */
+UNIV_INTERN
+void
+srv_conc_exit_innodb(
+/*=================*/
+ trx_t* trx); /*!< in: transaction object associated with the
+ thread */
+/***************************************************************//**
+Puts a MySQL OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+srv_suspend_mysql_thread(
+/*=====================*/
+ que_thr_t* thr); /*!< in: query thread associated with the MySQL
+ OS thread */
+/********************************************************************//**
+Releases a MySQL OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+UNIV_INTERN
+void
+srv_release_mysql_thread_if_suspended(
+/*==================================*/
+ que_thr_t* thr); /*!< in: query thread associated with the
+ MySQL OS thread */
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+This also prints the info output by various InnoDB monitors.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_lock_timeout_and_monitor_thread(
+/*================================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/*********************************************************************//**
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_error_monitor_thread(
+/*=====================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor. */
+UNIV_INTERN
+void
+srv_printf_innodb_monitor(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+ ulint* trx_start, /*!< out: file position of the start of
+ the list of active transactions */
+ ulint* trx_end); /*!< out: file position of the end of
+ the list of active transactions */
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+UNIV_INTERN
+void
+srv_export_innodb_status(void);
+/*==========================*/
+
+/** Thread slot in the thread table */
+typedef struct srv_slot_struct srv_slot_t;
+
+/** Thread table is an array of slots */
+typedef srv_slot_t srv_table_t;
+
+/** Status variables to be passed to MySQL */
+struct export_var_struct{
+ ulint innodb_data_pending_reads; /*!< Pending reads */
+ ulint innodb_data_pending_writes; /*!< Pending writes */
+ ulint innodb_data_pending_fsyncs; /*!< Pending fsyncs */
+ ulint innodb_data_fsyncs; /*!< Number of fsyncs so far */
+ ulint innodb_data_read; /*!< Data bytes read */
+ ulint innodb_data_writes; /*!< I/O write requests */
+ ulint innodb_data_written; /*!< Data bytes written */
+ ulint innodb_data_reads; /*!< I/O read requests */
+ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */
+ ulint innodb_buffer_pool_pages_data; /*!< Data pages */
+ ulint innodb_buffer_pool_pages_dirty; /*!< Dirty data pages */
+ ulint innodb_buffer_pool_pages_misc; /*!< Miscellanous pages */
+ ulint innodb_buffer_pool_pages_free; /*!< Free pages */
+#ifdef UNIV_DEBUG
+ ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */
+#endif /* UNIV_DEBUG */
+ ulint innodb_buffer_pool_read_requests; /*!< buf_pool->n_page_gets */
+ ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */
+ ulint innodb_buffer_pool_wait_free; /*!< srv_buf_pool_wait_free */
+ ulint innodb_buffer_pool_pages_flushed; /*!< srv_buf_pool_flushed */
+ ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */
+ ulint innodb_buffer_pool_read_ahead_seq;/*!< srv_read_ahead_seq */
+ ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
+ ulint innodb_dblwr_pages_written; /*!< srv_dblwr_pages_written */
+ ulint innodb_dblwr_writes; /*!< srv_dblwr_writes */
+ ibool innodb_have_atomic_builtins; /*!< HAVE_ATOMIC_BUILTINS */
+ ulint innodb_log_waits; /*!< srv_log_waits */
+ ulint innodb_log_write_requests; /*!< srv_log_write_requests */
+ ulint innodb_log_writes; /*!< srv_log_writes */
+ ulint innodb_os_log_written; /*!< srv_os_log_written */
+ ulint innodb_os_log_fsyncs; /*!< fil_n_log_flushes */
+ ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */
+ ulint innodb_os_log_pending_fsyncs; /*!< fil_n_pending_log_flushes */
+ ulint innodb_page_size; /*!< UNIV_PAGE_SIZE */
+ ulint innodb_pages_created; /*!< buf_pool->n_pages_created */
+ ulint innodb_pages_read; /*!< buf_pool->n_pages_read */
+ ulint innodb_pages_written; /*!< buf_pool->n_pages_written */
+ ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */
+ ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */
+ ib_int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time
+ / 1000 */
+ ulint innodb_row_lock_time_avg; /*!< srv_n_lock_wait_time
+ / 1000
+ / srv_n_lock_wait_count */
+ ulint innodb_row_lock_time_max; /*!< srv_n_lock_max_wait_time
+ / 1000 */
+ ulint innodb_rows_read; /*!< srv_n_rows_read */
+ ulint innodb_rows_inserted; /*!< srv_n_rows_inserted */
+ ulint innodb_rows_updated; /*!< srv_n_rows_updated */
+ ulint innodb_rows_deleted; /*!< srv_n_rows_deleted */
+};
+
+/** The server system struct */
+struct srv_sys_struct{
+ srv_table_t* threads; /*!< server thread table */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ tasks; /*!< task queue */
+};
+
+extern ulint srv_n_threads_active[];
+#else /* !UNIV_HOTBACKUP */
+# define srv_use_checksums TRUE
+# define srv_use_adaptive_hash_indexes FALSE
+# define srv_force_recovery 0UL
+# define srv_set_io_thread_op_info(t,info) ((void) 0)
+# define srv_is_being_started 0
+# define srv_win_file_flush_method SRV_WIN_IO_UNBUFFERED
+# define srv_unix_file_flush_method SRV_UNIX_O_DSYNC
+# define srv_start_raw_disk_in_use 0
+# define srv_file_per_table 1
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/srv0srv.ic b/storage/innobase/include/srv0srv.ic
new file mode 100644
index 00000000000..8a1a678a016
--- /dev/null
+++ b/storage/innobase/include/srv0srv.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.ic
+Server main program
+
+Created 10/4/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
new file mode 100644
index 00000000000..8abf15da9c1
--- /dev/null
+++ b/storage/innobase/include/srv0start.h
@@ -0,0 +1,134 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0start.h
+Starts the Innobase database server
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0start_h
+#define srv0start_h
+
+#include "univ.i"
+#include "ut0byte.h"
+
+/*********************************************************************//**
+Normalizes a directory path for Windows: converts slashes to backslashes. */
+UNIV_INTERN
+void
+srv_normalize_path_for_win(
+/*=======================*/
+ char* str); /*!< in/out: null-terminated character string */
+/*********************************************************************//**
+Reads the data files and their sizes from a character string given in
+the .cnf file.
+@return TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_data_file_paths_and_sizes(
+/*================================*/
+ char* str); /*!< in/out: the data file path string */
+/*********************************************************************//**
+Reads log group home directories from a character string given in
+the .cnf file.
+@return TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_log_group_home_dirs(
+/*==========================*/
+ char* str); /*!< in/out: character string */
+/*********************************************************************//**
+Frees the memory allocated by srv_parse_data_file_paths_and_sizes()
+and srv_parse_log_group_home_dirs(). */
+UNIV_INTERN
+void
+srv_free_paths_and_sizes(void);
+/*==========================*/
+/*********************************************************************//**
+Adds a slash or a backslash to the end of a string if it is missing
+and the string is not empty.
+@return string which has the separator if the string is not empty */
+UNIV_INTERN
+char*
+srv_add_path_separator_if_needed(
+/*=============================*/
+ char* str); /*!< in: null-terminated character string */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Starts Innobase and creates a new database if database files
+are not found and the user wants.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+int
+innobase_start_or_create_for_mysql(void);
+/*====================================*/
+/****************************************************************//**
+Shuts down the Innobase database.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+int
+innobase_shutdown_for_mysql(void);
+/*=============================*/
+/** Log sequence number at shutdown */
+extern ib_uint64_t srv_shutdown_lsn;
+/** Log sequence number immediately after startup */
+extern ib_uint64_t srv_start_lsn;
+
+#ifdef __NETWARE__
+void set_panic_flag_for_netware(void);
+#endif
+
+#ifdef HAVE_DARWIN_THREADS
+/** TRUE if the F_FULLFSYNC option is available */
+extern ibool srv_have_fullfsync;
+#endif
+
+/** TRUE if the server is being started */
+extern ibool srv_is_being_started;
+/** TRUE if the server was successfully started */
+extern ibool srv_was_started;
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+extern ibool srv_startup_is_before_trx_rollback_phase;
+
+/** TRUE if a raw partition is in use */
+extern ibool srv_start_raw_disk_in_use;
+
+
+/** Shutdown state */
+enum srv_shutdown_state {
+ SRV_SHUTDOWN_NONE = 0, /*!< Database running normally */
+ SRV_SHUTDOWN_CLEANUP, /*!< Cleaning up in
+ logs_empty_and_mark_files_at_shutdown() */
+ SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that
+ the buffer pool can be freed: flush
+ all file spaces and close all files */
+ SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */
+};
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+extern enum srv_shutdown_state srv_shutdown_state;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Log 'spaces' have id's >= this */
+#define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0UL
+
+#endif
diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h
new file mode 100644
index 00000000000..5f1280f5e28
--- /dev/null
+++ b/storage/innobase/include/sync0arr.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.h
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0arr_h
+#define sync0arr_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+
+/** Synchronization wait array cell */
+typedef struct sync_cell_struct sync_cell_t;
+/** Synchronization wait array */
+typedef struct sync_array_struct sync_array_t;
+
+/** Parameters for sync_array_create() @{ */
+#define SYNC_ARRAY_OS_MUTEX 1 /*!< protected by os_mutex_t */
+#define SYNC_ARRAY_MUTEX 2 /*!< protected by mutex_t */
+/* @} */
+
+/*******************************************************************//**
+Creates a synchronization wait array. It is protected by a mutex
+which is automatically reserved when the functions operating on it
+are called.
+@return own: created wait array */
+UNIV_INTERN
+sync_array_t*
+sync_array_create(
+/*==============*/
+ ulint n_cells, /*!< in: number of cells in the array
+ to create */
+ ulint protection); /*!< in: either SYNC_ARRAY_OS_MUTEX or
+ SYNC_ARRAY_MUTEX: determines the type
+ of mutex protecting the data structure */
+/******************************************************************//**
+Frees the resources in a wait array. */
+UNIV_INTERN
+void
+sync_array_free(
+/*============*/
+ sync_array_t* arr); /*!< in, own: sync wait array */
+/******************************************************************//**
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state. */
+UNIV_INTERN
+void
+sync_array_reserve_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: wait array */
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ ulint line, /*!< in: line where requested */
+ ulint* index); /*!< out: index of the reserved cell */
+/******************************************************************//**
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case prints info and asserts. */
+UNIV_INTERN
+void
+sync_array_wait_event(
+/*==================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index); /*!< in: index of the reserved cell */
+/******************************************************************//**
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+UNIV_INTERN
+void
+sync_array_free_cell(
+/*=================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index); /*!< in: index of the cell in array */
+/**********************************************************************//**
+Note that one of the wait objects was signalled. */
+UNIV_INTERN
+void
+sync_array_object_signalled(
+/*========================*/
+ sync_array_t* arr); /*!< in: wait array */
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore relases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about every 1 second in the server. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void);
+/*====================================*/
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(void);
+/*=============================*/
+/********************************************************************//**
+Validates the integrity of the wait array. Checks
+that the number of reserved cells equals the count variable. */
+UNIV_INTERN
+void
+sync_array_validate(
+/*================*/
+ sync_array_t* arr); /*!< in: sync wait array */
+/**********************************************************************//**
+Prints info of the wait array. */
+UNIV_INTERN
+void
+sync_array_print_info(
+/*==================*/
+ FILE* file, /*!< in: file where to print */
+ sync_array_t* arr); /*!< in: wait array */
+
+
+#ifndef UNIV_NONINL
+#include "sync0arr.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/sync0arr.ic b/storage/innobase/include/sync0arr.ic
new file mode 100644
index 00000000000..bf57f5b2dc2
--- /dev/null
+++ b/storage/innobase/include/sync0arr.ic
@@ -0,0 +1,27 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.ic
+The wait array for synchronization primitives
+
+Inline code
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h
new file mode 100644
index 00000000000..aedfd5f3f86
--- /dev/null
+++ b/storage/innobase/include/sync0rw.h
@@ -0,0 +1,585 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.h
+The read-write lock (for threads, not for database transactions)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0rw_h
+#define sync0rw_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "ut0lst.h"
+#include "sync0sync.h"
+#include "os0sync.h"
+
+/* The following undef is to prevent a name conflict with a macro
+in MySQL: */
+#undef rw_lock_t
+#endif /* !UNIV_HOTBACKUP */
+
+/* Latch types; these are used also in btr0btr.h: keep the numerical values
+smaller than 30 and the order of the numerical values like below! */
+#define RW_S_LATCH 1
+#define RW_X_LATCH 2
+#define RW_NO_LATCH 3
+
+#ifndef UNIV_HOTBACKUP
+/* We decrement lock_word by this amount for each x_lock. It is also the
+start value for the lock_word, meaning that it limits the maximum number
+of concurrent read locks before the rw_lock breaks. The current value of
+0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/
+#define X_LOCK_DECR 0x00100000
+
+typedef struct rw_lock_struct rw_lock_t;
+#ifdef UNIV_SYNC_DEBUG
+typedef struct rw_lock_debug_struct rw_lock_debug_t;
+#endif /* UNIV_SYNC_DEBUG */
+
+typedef UT_LIST_BASE_NODE_T(rw_lock_t) rw_lock_list_t;
+
+extern rw_lock_list_t rw_lock_list;
+extern mutex_t rw_lock_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/* The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+
+acquired in addition to the mutex protecting the lock. */
+extern mutex_t rw_lock_debug_mutex;
+extern os_event_t rw_lock_debug_event; /*!< If deadlock detection does
+ not get immediately the mutex it
+ may wait for this event */
+extern ibool rw_lock_debug_waiters; /*!< This is set to TRUE, if
+ there may be waiters for the event */
+#endif /* UNIV_SYNC_DEBUG */
+
+/** number of spin waits on rw-latches,
+resulted during exclusive (write) locks */
+extern ib_int64_t rw_s_spin_wait_count;
+/** number of spin loop rounds on rw-latches,
+resulted during exclusive (write) locks */
+extern ib_int64_t rw_s_spin_round_count;
+/** number of unlocks (that unlock shared locks),
+set only when UNIV_SYNC_PERF_STAT is defined */
+extern ib_int64_t rw_s_exit_count;
+/** number of OS waits on rw-latches,
+resulted during shared (read) locks */
+extern ib_int64_t rw_s_os_wait_count;
+/** number of spin waits on rw-latches,
+resulted during shared (read) locks */
+extern ib_int64_t rw_x_spin_wait_count;
+/** number of spin loop rounds on rw-latches,
+resulted during shared (read) locks */
+extern ib_int64_t rw_x_spin_round_count;
+/** number of OS waits on rw-latches,
+resulted during exclusive (write) locks */
+extern ib_int64_t rw_x_os_wait_count;
+/** number of unlocks (that unlock exclusive locks),
+set only when UNIV_SYNC_PERF_STAT is defined */
+extern ib_int64_t rw_x_exit_count;
+
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_create(L, level) \
+ rw_lock_create_func((L), (level), #L, __FILE__, __LINE__)
+# else /* UNIV_SYNC_DEBUG */
+# define rw_lock_create(L, level) \
+ rw_lock_create_func((L), #L, __FILE__, __LINE__)
+# endif /* UNIV_SYNC_DEBUG */
+#else /* UNIV_DEBUG */
+# define rw_lock_create(L, level) \
+ rw_lock_create_func((L), __FILE__, __LINE__)
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+rw_lock_create_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline); /*!< in: file line where created */
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Removes an rw-lock object from the global list. The
+rw-lock is checked to be in the non-locked state. */
+UNIV_INTERN
+void
+rw_lock_free(
+/*=========*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks.
+@return TRUE */
+UNIV_INTERN
+ibool
+rw_lock_validate(
+/*=============*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#define rw_lock_s_lock(M) rw_lock_s_lock_func(\
+ (M), 0, __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#define rw_lock_s_lock_gen(M, P) rw_lock_s_lock_func(\
+ (M), (P), __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#define rw_lock_s_lock_nowait(M, F, L) rw_lock_s_lock_low(\
+ (M), 0, (F), (L))
+/******************************************************************//**
+Low-level function which tries to lock an rw-lock in s-mode. Performs no
+spinning.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass __attribute__((unused)),
+ /*!< in: pass value; != 0, if the lock will be
+ passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function, except if
+you supply the file name and line number. Lock an rw-lock in shared mode
+for the current thread. If the rw-lock is locked in exclusive mode, or
+there is an exclusive lock request waiting, the function spins a preset
+time (controlled by SYNC_SPIN_ROUNDS), waiting for the lock, before
+suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+
+#ifdef UNIV_SYNC_DEBUG
+# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(P, L)
+#else
+# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L)
+#endif
+/*******************************************************************//**
+Releases a shared mode lock. */
+#define rw_lock_s_unlock(L) rw_lock_s_unlock_gen(L, 0)
+
+/**************************************************************//**
+NOTE! The following macro should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock(M) rw_lock_x_lock_func(\
+ (M), 0, __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macro should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock_gen(M, P) rw_lock_x_lock_func(\
+ (M), (P), __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macros should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock_nowait(M) rw_lock_x_lock_func_nowait(\
+ (M), __FILE__, __LINE__)
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeed, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+
+#ifdef UNIV_SYNC_DEBUG
+# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(P, L)
+#else
+# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L)
+#endif
+/*******************************************************************//**
+Releases an exclusive mode lock. */
+#define rw_lock_x_unlock(L) rw_lock_x_unlock_gen(L, 0)
+
+/******************************************************************//**
+Low-level function which locks an rw-lock in s-mode when we know that it
+is possible and none else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_s_lock_direct(
+/*==================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ const char* file_name, /*!< in: file name where requested */
+ ulint line); /*!< in: line where lock requested */
+/******************************************************************//**
+Low-level function which locks an rw-lock in x-mode when we know that it
+is not locked and none else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_x_lock_direct(
+/*==================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ const char* file_name, /*!< in: file name where requested */
+ ulint line); /*!< in: line where lock requested */
+/******************************************************************//**
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want that the current
+thread is able to acquire a second x-latch which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+UNIV_INTERN
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+ rw_lock_t* lock); /*!< in: lock which was x-locked in the
+ buffer read */
+/******************************************************************//**
+Releases a shared mode lock when we know there are no waiters and none
+else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_direct(
+/*====================*/
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Releases an exclusive mode lock when we know there are no waiters, and
+none else will access the lock durint the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_direct(
+/*====================*/
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return value of writer_count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/********************************************************************//**
+Check if there are threads waiting for the rw-lock.
+@return 1 if waiters, 0 otherwise */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Returns the write-status of the lock - this function made more sense
+with the old rw_lock implementation.
+@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Returns the number of readers.
+@return number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Decrements lock_word the specified amount if it is greater than 0.
+This is used by both s_lock and x_lock operations.
+@return TRUE if decr occurs */
+UNIV_INLINE
+ibool
+rw_lock_lock_word_decr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount); /*!< in: amount to decrement */
+/******************************************************************//**
+Increments lock_word the specified amount and returns new value.
+@return lock->lock_word after increment */
+UNIV_INLINE
+lint
+rw_lock_lock_word_incr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount); /*!< in: amount to increment */
+/******************************************************************//**
+This function sets the lock->writer_thread and lock->recursive fields.
+For platforms where we are using atomic builtins instead of lock->mutex
+it sets the lock->writer_thread field using atomics to ensure memory
+ordering. Note that it is assumed that the caller of this function
+effectively owns the lock i.e.: nobody else is allowed to modify
+lock->writer_thread at this point in time.
+The protocol is that lock->writer_thread MUST be updated BEFORE the
+lock->recursive flag is set. */
+UNIV_INLINE
+void
+rw_lock_set_writer_id_and_recursion_flag(
+/*=====================================*/
+ rw_lock_t* lock, /*!< in/out: lock to work on */
+ ibool recursive); /*!< in: TRUE if recursion
+ allowed */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0. */
+UNIV_INTERN
+ibool
+rw_lock_own(
+/*========*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type); /*!< in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+Checks if somebody has locked the rw-lock in the specified mode. */
+UNIV_INTERN
+ibool
+rw_lock_is_locked(
+/*==============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type); /*!< in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+#ifdef UNIV_SYNC_DEBUG
+/***************************************************************//**
+Prints debug info of an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_print(
+/*==========*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+/***************************************************************//**
+Prints debug info of currently locked rw-locks. */
+UNIV_INTERN
+void
+rw_lock_list_print_info(
+/*====================*/
+ FILE* file); /*!< in: file where to print */
+/***************************************************************//**
+Returns the number of currently locked rw-locks.
+Works only in the debug version.
+@return number of locked rw-locks */
+UNIV_INTERN
+ulint
+rw_lock_n_locked(void);
+/*==================*/
+
+/*#####################################################################*/
+
+/******************************************************************//**
+Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_enter(void);
+/*==========================*/
+/******************************************************************//**
+Releases the debug mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_exit(void);
+/*==========================*/
+/*********************************************************************//**
+Prints info of a debug struct. */
+UNIV_INTERN
+void
+rw_lock_debug_print(
+/*================*/
+ rw_lock_debug_t* info); /*!< in: debug struct */
+#endif /* UNIV_SYNC_DEBUG */
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! */
+
+/** The structure used in the spin lock implementation of a read-write
+lock. Several threads may have a shared lock simultaneously in this
+lock, but only one writer may have an exclusive lock, in which case no
+shared locks are allowed. To prevent starving of a writer blocked by
+readers, a writer may queue for x-lock by decrementing lock_word: no
+new readers will be let in while the thread waits for readers to
+exit. */
+struct rw_lock_struct {
+ volatile lint lock_word;
+ /*!< Holds the state of the lock. */
+ volatile ulint waiters;/*!< 1: there are waiters */
+ volatile ibool recursive;/*!< Default value FALSE which means the lock
+ is non-recursive. The value is typically set
+ to TRUE making normal rw_locks recursive. In
+ case of asynchronous IO, when a non-zero
+ value of 'pass' is passed then we keep the
+ lock non-recursive.
+ This flag also tells us about the state of
+ writer_thread field. If this flag is set
+ then writer_thread MUST contain the thread
+ id of the current x-holder or wait-x thread.
+ This flag must be reset in x_unlock
+ functions before incrementing the lock_word */
+ volatile os_thread_id_t writer_thread;
+ /*!< Thread id of writer thread. Is only
+ guaranteed to have sane and non-stale
+ value iff recursive flag is set. */
+ os_event_t event; /*!< Used by sync0arr.c for thread queueing */
+ os_event_t wait_ex_event;
+ /*!< Event for next-writer to wait on. A thread
+ must decrement lock_word before waiting. */
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ mutex_t mutex; /*!< The mutex protecting rw_lock_struct */
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+ UT_LIST_NODE_T(rw_lock_t) list;
+ /*!< All allocated rw locks are put into a
+ list */
+#ifdef UNIV_SYNC_DEBUG
+ UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list;
+ /*!< In the debug version: pointer to the debug
+ info list of the lock */
+ ulint level; /*!< Level in the global latching order. */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint count_os_wait; /*!< Count of os_waits. May not be accurate */
+ const char* cfile_name;/*!< File name where lock created */
+ /* last s-lock file/line is not guaranteed to be correct */
+ const char* last_s_file_name;/*!< File name where last s-locked */
+ const char* last_x_file_name;/*!< File name where last x-locked */
+ ibool writer_is_wait_ex;
+ /*!< This is TRUE if the writer field is
+ RW_LOCK_WAIT_EX; this field is located far
+ from the memory update hotspot fields which
+ are at the start of this struct, thus we can
+ peek this field without causing much memory
+ bus traffic */
+ unsigned cline:14; /*!< Line where created */
+ unsigned last_s_line:14; /*!< Line number where last time s-locked */
+ unsigned last_x_line:14; /*!< Line number where last time x-locked */
+ ulint magic_n; /*!< RW_LOCK_MAGIC_N */
+};
+
+/** Value of rw_lock_struct::magic_n */
+#define RW_LOCK_MAGIC_N 22643
+
+#ifdef UNIV_SYNC_DEBUG
+/** The structure for storing debug info of an rw-lock */
+struct rw_lock_debug_struct {
+
+ os_thread_id_t thread_id; /*!< The thread id of the thread which
+ locked the rw-lock */
+ ulint pass; /*!< Pass value given in the lock operation */
+ ulint lock_type; /*!< Type of the lock: RW_LOCK_EX,
+ RW_LOCK_SHARED, RW_LOCK_WAIT_EX */
+ const char* file_name;/*!< File name where the lock was obtained */
+ ulint line; /*!< Line where the rw-lock was locked */
+ UT_LIST_NODE_T(rw_lock_debug_t) list;
+ /*!< Debug structs are linked in a two-way
+ list */
+};
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifndef UNIV_NONINL
+#include "sync0rw.ic"
+#endif
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic
new file mode 100644
index 00000000000..7116f1b7c9b
--- /dev/null
+++ b/storage/innobase/include/sync0rw.ic
@@ -0,0 +1,624 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.ic
+The read-write lock (for threads)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+/******************************************************************//**
+Lock an rw-lock in shared mode for the current thread. If the rw-lock is
+locked in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
+waiting for the lock before suspending the thread. */
+UNIV_INTERN
+void
+rw_lock_s_lock_spin(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Inserts the debug information for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_add_debug_info(
+/*===================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type, /*!< in: lock type */
+ const char* file_name, /*!< in: file where requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Removes a debug information struct for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_remove_debug_info(
+/*======================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type); /*!< in: lock type */
+#endif /* UNIV_SYNC_DEBUG */
+
+/********************************************************************//**
+Check if there are threads waiting for the rw-lock.
+@return 1 if waiters, 0 otherwise */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ return(lock->waiters);
+}
+
+/********************************************************************//**
+Sets lock->waiters to 1. It is not an error if lock->waiters is already
+1. On platforms where ATOMIC builtins are used this function enforces a
+memory barrier. */
+UNIV_INLINE
+void
+rw_lock_set_waiter_flag(
+/*====================*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ os_compare_and_swap_ulint(&lock->waiters, 0, 1);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ lock->waiters = 1;
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/********************************************************************//**
+Resets lock->waiters to 0. It is not an error if lock->waiters is already
+0. On platforms where ATOMIC builtins are used this function enforces a
+memory barrier. */
+UNIV_INLINE
+void
+rw_lock_reset_waiter_flag(
+/*======================*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ os_compare_and_swap_ulint(&lock->waiters, 1, 0);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ lock->waiters = 0;
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Returns the write-status of the lock - this function made more sense
+with the old rw_lock implementation.
+@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ lint lock_word = lock->lock_word;
+ if (lock_word > 0) {
+ /* return NOT_LOCKED in s-lock state, like the writer
+ member of the old lock implementation. */
+ return(RW_LOCK_NOT_LOCKED);
+ } else if (((-lock_word) % X_LOCK_DECR) == 0) {
+ return(RW_LOCK_EX);
+ } else {
+ ut_ad(lock_word > -X_LOCK_DECR);
+ return(RW_LOCK_WAIT_EX);
+ }
+}
+
+/******************************************************************//**
+Returns the number of readers.
+@return number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ lint lock_word = lock->lock_word;
+ if (lock_word > 0) {
+ /* s-locked, no x-waiters */
+ return(X_LOCK_DECR - lock_word);
+ } else if (lock_word < 0 && lock_word > -X_LOCK_DECR) {
+ /* s-locked, with x-waiters */
+ return((ulint)(-lock_word));
+ }
+ return(0);
+}
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+UNIV_INLINE
+mutex_t*
+rw_lock_get_mutex(
+/*==============*/
+ rw_lock_t* lock)
+{
+ return(&(lock->mutex));
+}
+#endif
+
+/******************************************************************//**
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return value of writer_count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ lint lock_copy = lock->lock_word;
+ /* If there is a reader, lock_word is not divisible by X_LOCK_DECR */
+ if (lock_copy > 0 || (-lock_copy) % X_LOCK_DECR != 0) {
+ return(0);
+ }
+ return(((-lock_copy) / X_LOCK_DECR) + 1);
+}
+
+/******************************************************************//**
+Two different implementations for decrementing the lock_word of a rw_lock:
+one for systems supporting atomic operations, one for others. This does
+does not support recusive x-locks: they should be handled by the caller and
+need not be atomic since they are performed by the current lock holder.
+Returns true if the decrement was made, false if not.
+@return TRUE if decr occurs */
+UNIV_INLINE
+ibool
+rw_lock_lock_word_decr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount) /*!< in: amount to decrement */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ lint local_lock_word = lock->lock_word;
+ while (local_lock_word > 0) {
+ if (os_compare_and_swap_lint(&lock->lock_word,
+ local_lock_word,
+ local_lock_word - amount)) {
+ return(TRUE);
+ }
+ local_lock_word = lock->lock_word;
+ }
+ return(FALSE);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ ibool success = FALSE;
+ mutex_enter(&(lock->mutex));
+ if (lock->lock_word > 0) {
+ lock->lock_word -= amount;
+ success = TRUE;
+ }
+ mutex_exit(&(lock->mutex));
+ return(success);
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Increments lock_word the specified amount and returns new value.
+@return lock->lock_word after increment */
+UNIV_INLINE
+lint
+rw_lock_lock_word_incr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount) /*!< in: amount of increment */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ return(os_atomic_increment_lint(&lock->lock_word, amount));
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ lint local_lock_word;
+
+ mutex_enter(&(lock->mutex));
+
+ lock->lock_word += amount;
+ local_lock_word = lock->lock_word;
+
+ mutex_exit(&(lock->mutex));
+
+ return(local_lock_word);
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+This function sets the lock->writer_thread and lock->recursive fields.
+For platforms where we are using atomic builtins instead of lock->mutex
+it sets the lock->writer_thread field using atomics to ensure memory
+ordering. Note that it is assumed that the caller of this function
+effectively owns the lock i.e.: nobody else is allowed to modify
+lock->writer_thread at this point in time.
+The protocol is that lock->writer_thread MUST be updated BEFORE the
+lock->recursive flag is set. */
+UNIV_INLINE
+void
+rw_lock_set_writer_id_and_recursion_flag(
+/*=====================================*/
+ rw_lock_t* lock, /*!< in/out: lock to work on */
+ ibool recursive) /*!< in: TRUE if recursion
+ allowed */
+{
+ os_thread_id_t curr_thread = os_thread_get_curr_id();
+
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ os_thread_id_t local_thread;
+ ibool success;
+
+ /* Prevent Valgrind warnings about writer_thread being
+ uninitialized. It does not matter if writer_thread is
+ uninitialized, because we are comparing writer_thread against
+ itself, and the operation should always succeed. */
+ UNIV_MEM_VALID(&lock->writer_thread, sizeof lock->writer_thread);
+
+ local_thread = lock->writer_thread;
+ success = os_compare_and_swap_thread_id(
+ &lock->writer_thread, local_thread, curr_thread);
+ ut_a(success);
+ lock->recursive = recursive;
+
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+ mutex_enter(&lock->mutex);
+ lock->writer_thread = curr_thread;
+ lock->recursive = recursive;
+ mutex_exit(&lock->mutex);
+
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Low-level function which tries to lock an rw-lock in s-mode. Performs no
+spinning.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass __attribute__((unused)),
+ /*!< in: pass value; != 0, if the lock will be
+ passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ /* TODO: study performance of UNIV_LIKELY branch prediction hints. */
+ if (!rw_lock_lock_word_decr(lock, 1)) {
+ /* Locking did not succeed */
+ return(FALSE);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line);
+#endif
+ /* These debugging values are not set safely: they may be incorrect
+ or even refer to a line that is invalid for the file name. */
+ lock->last_s_file_name = file_name;
+ lock->last_s_line = line;
+
+ return(TRUE); /* locking succeeded */
+}
+
+/******************************************************************//**
+Low-level function which locks an rw-lock in s-mode when we know that it
+is possible and none else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_s_lock_direct(
+/*==================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ const char* file_name, /*!< in: file name where requested */
+ ulint line) /*!< in: line where lock requested */
+{
+ ut_ad(lock->lock_word == X_LOCK_DECR);
+
+ /* Indicate there is a new reader by decrementing lock_word */
+ lock->lock_word--;
+
+ lock->last_s_file_name = file_name;
+ lock->last_s_line = line;
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, line);
+#endif
+}
+
+/******************************************************************//**
+Low-level function which locks an rw-lock in x-mode when we know that it
+is not locked and none else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_x_lock_direct(
+/*==================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ const char* file_name, /*!< in: file name where requested */
+ ulint line) /*!< in: line where lock requested */
+{
+ ut_ad(rw_lock_validate(lock));
+ ut_ad(lock->lock_word == X_LOCK_DECR);
+
+ lock->lock_word -= X_LOCK_DECR;
+ lock->writer_thread = os_thread_get_curr_id();
+ lock->recursive = TRUE;
+
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = line;
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
+#endif
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in shared mode for the current thread. If the rw-lock is locked
+in exclusive mode, or there is an exclusive lock request waiting, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for
+the lock, before suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ /* NOTE: As we do not know the thread ids for threads which have
+ s-locked a latch, and s-lockers will be served only after waiting
+ x-lock requests have been fulfilled, then if this thread already
+ owns an s-lock here, it may end up in a deadlock with another thread
+ which requests an x-lock here. Therefore, we will forbid recursive
+ s-locking of a latch: the following assert will warn the programmer
+ of the possibility of this kind of a deadlock. If we want to implement
+ safe recursive s-locking, we should keep in a list the thread ids of
+ the threads which have s-locked a latch. This would use some CPU
+ time. */
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* TODO: study performance of UNIV_LIKELY branch prediction hints. */
+ if (rw_lock_s_lock_low(lock, pass, file_name, line)) {
+
+ return; /* Success */
+ } else {
+ /* Did not succeed, try spin wait */
+
+ rw_lock_s_lock_spin(lock, pass, file_name, line);
+
+ return;
+ }
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ os_thread_id_t curr_thread = os_thread_get_curr_id();
+
+ ibool success;
+
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ success = os_compare_and_swap_lint(&lock->lock_word, X_LOCK_DECR, 0);
+#else
+
+ success = FALSE;
+ mutex_enter(&(lock->mutex));
+ if (lock->lock_word == X_LOCK_DECR) {
+ lock->lock_word = 0;
+ success = TRUE;
+ }
+ mutex_exit(&(lock->mutex));
+
+#endif
+ if (success) {
+ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE);
+
+ } else if (lock->recursive
+ && os_thread_eq(lock->writer_thread, curr_thread)) {
+ /* Relock: this lock_word modification is safe since no other
+ threads can modify (lock, unlock, or reserve) lock_word while
+ there is an exclusive writer and this is the writer thread. */
+ lock->lock_word -= X_LOCK_DECR;
+
+ ut_ad(((-lock->lock_word) % X_LOCK_DECR) == 0);
+
+ } else {
+ /* Failure */
+ return(FALSE);
+ }
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
+#endif
+
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = line;
+
+ ut_ad(rw_lock_validate(lock));
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_ad((lock->lock_word % X_LOCK_DECR) != 0);
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED);
+#endif
+
+ /* Increment lock_word to indicate 1 less reader */
+ if (rw_lock_lock_word_incr(lock, 1) == 0) {
+
+ /* wait_ex waiter exists. It may not be asleep, but we signal
+ anyway. We do not wake other waiters, because they can't
+ exist without wait_ex waiter and wait_ex waiter goes first.*/
+ os_event_set(lock->wait_ex_event);
+ sync_array_object_signalled(sync_primary_wait_array);
+
+ }
+
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_s_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Releases a shared mode lock when we know there are no waiters and none
+else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_direct(
+/*====================*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_ad(lock->lock_word < X_LOCK_DECR);
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED);
+#endif
+
+ /* Decrease reader count by incrementing lock_word */
+ lock->lock_word++;
+
+ ut_ad(!lock->waiters);
+ ut_ad(rw_lock_validate(lock));
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_s_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
+
+ /* lock->recursive flag also indicates if lock->writer_thread is
+ valid or stale. If we are the last of the recursive callers
+ then we must unset lock->recursive flag to indicate that the
+ lock->writer_thread is now stale.
+ Note that since we still hold the x-lock we can safely read the
+ lock_word. */
+ if (lock->lock_word == 0) {
+ /* Last caller in a possible recursive chain. */
+ lock->recursive = FALSE;
+ UNIV_MEM_INVALID(&lock->writer_thread,
+ sizeof lock->writer_thread);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX);
+#endif
+
+ if (rw_lock_lock_word_incr(lock, X_LOCK_DECR) == X_LOCK_DECR) {
+ /* Lock is now free. May have to signal read/write waiters.
+ We do not need to signal wait_ex waiters, since they cannot
+ exist when there is a writer. */
+ if (lock->waiters) {
+ rw_lock_reset_waiter_flag(lock);
+ os_event_set(lock->event);
+ sync_array_object_signalled(sync_primary_wait_array);
+ }
+ }
+
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_x_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Releases an exclusive mode lock when we know there are no waiters, and
+none else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_direct(
+/*====================*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ /* Reset the exclusive lock if this thread no longer has an x-mode
+ lock */
+
+ ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
+#endif
+
+ if (lock->lock_word == 0) {
+ lock->recursive = FALSE;
+ UNIV_MEM_INVALID(&lock->writer_thread,
+ sizeof lock->writer_thread);
+ }
+
+ lock->lock_word += X_LOCK_DECR;
+
+ ut_ad(!lock->waiters);
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_x_exit_count++;
+#endif
+}
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
new file mode 100644
index 00000000000..df990823cc4
--- /dev/null
+++ b/storage/innobase/include/sync0sync.h
@@ -0,0 +1,578 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0sync.h
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0sync_h
+#define sync0sync_h
+
+#include "univ.i"
+#include "sync0types.h"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+#include "os0sync.h"
+#include "sync0arr.h"
+
+#if defined(UNIV_DEBUG) && !defined(UNIV_HOTBACKUP)
+extern my_bool timed_mutexes;
+#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
+
+#ifdef HAVE_WINDOWS_ATOMICS
+typedef LONG lock_word_t; /*!< On Windows, InterlockedExchange operates
+ on LONG variable */
+#else
+typedef byte lock_word_t;
+#endif
+
+/******************************************************************//**
+Initializes the synchronization data structures. */
+UNIV_INTERN
+void
+sync_init(void);
+/*===========*/
+/******************************************************************//**
+Frees the resources in synchronization data structures. */
+UNIV_INTERN
+void
+sync_close(void);
+/*===========*/
+/******************************************************************//**
+Creates, or rather, initializes a mutex object to a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define mutex_create(M, level) \
+ mutex_create_func((M), #M, (level), __FILE__, __LINE__)
+# else
+# define mutex_create(M, level) \
+ mutex_create_func((M), #M, __FILE__, __LINE__)
+# endif
+#else
+# define mutex_create(M, level) \
+ mutex_create_func((M), __FILE__, __LINE__)
+#endif
+
+/******************************************************************//**
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+mutex_create_func(
+/*==============*/
+ mutex_t* mutex, /*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+ const char* cmutex_name, /*!< in: mutex name */
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline); /*!< in: file line where created */
+
+#undef mutex_free /* Fix for MacOS X */
+
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the mutex is freed. Removes a mutex object from the mutex list. The mutex
+is checked to be in the reset state. */
+UNIV_INTERN
+void
+mutex_free(
+/*=======*/
+ mutex_t* mutex); /*!< in: mutex */
+/**************************************************************//**
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+#define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+/* NOTE! currently same as mutex_enter! */
+
+#define mutex_enter_fast(M) mutex_enter_func((M), __FILE__, __LINE__)
+/******************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Locks a mutex for the current thread. If the mutex is reserved
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
+for the mutex before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line); /*!< in: line where locked */
+/**************************************************************//**
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+#define mutex_enter_nowait(M) \
+ mutex_enter_nowait_func((M), __FILE__, __LINE__)
+/********************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Tries to lock the mutex for the current thread. If the lock is not
+acquired immediately, returns with return value 1.
+@return 0 if succeed, 1 if not */
+UNIV_INTERN
+ulint
+mutex_enter_nowait_func(
+/*====================*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit(
+/*=======*/
+ mutex_t* mutex); /*!< in: pointer to mutex */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Returns TRUE if no mutex or rw-lock is currently locked.
+Works only in the debug version.
+@return TRUE if no mutexes and rw-locks reserved */
+UNIV_INTERN
+ibool
+sync_all_freed(void);
+/*================*/
+#endif /* UNIV_SYNC_DEBUG */
+/*#####################################################################
+FUNCTION PROTOTYPES FOR DEBUGGING */
+/*******************************************************************//**
+Prints wait info of the sync system. */
+UNIV_INTERN
+void
+sync_print_wait_info(
+/*=================*/
+ FILE* file); /*!< in: file where to print */
+/*******************************************************************//**
+Prints info of the sync system. */
+UNIV_INTERN
+void
+sync_print(
+/*=======*/
+ FILE* file); /*!< in: file where to print */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the mutex has been initialized.
+@return TRUE */
+UNIV_INTERN
+ibool
+mutex_validate(
+/*===========*/
+ const mutex_t* mutex); /*!< in: mutex */
+/******************************************************************//**
+Checks that the current thread owns the mutex. Works only
+in the debug version.
+@return TRUE if owns */
+UNIV_INTERN
+ibool
+mutex_own(
+/*======*/
+ const mutex_t* mutex); /*!< in: mutex */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Adds a latch and its level in the thread level array. Allocates the memory
+for the array if called first time for this OS thread. Makes the checks
+against other latch levels stored in the array for this thread. */
+UNIV_INTERN
+void
+sync_thread_add_level(
+/*==================*/
+ void* latch, /*!< in: pointer to a mutex or an rw-lock */
+ ulint level); /*!< in: level in the latching order; if
+ SYNC_LEVEL_VARYING, nothing is done */
+/******************************************************************//**
+Removes a latch from the thread level array if it is found there.
+@return TRUE if found in the array; it is no error if the latch is
+not found, as we presently are not able to determine the level for
+every latch reservation the program does */
+UNIV_INTERN
+ibool
+sync_thread_reset_level(
+/*====================*/
+ void* latch); /*!< in: pointer to a mutex or an rw-lock */
+/******************************************************************//**
+Checks that the level array for the current thread is empty.
+@return TRUE if empty */
+UNIV_INTERN
+ibool
+sync_thread_levels_empty(void);
+/*==========================*/
+/******************************************************************//**
+Checks that the level array for the current thread is empty.
+@return TRUE if empty except the exceptions specified below */
+UNIV_INTERN
+ibool
+sync_thread_levels_empty_gen(
+/*=========================*/
+ ibool dict_mutex_allowed); /*!< in: TRUE if dictionary mutex is
+ allowed to be owned by the thread,
+ also purge_is_running mutex is
+ allowed */
+/******************************************************************//**
+Gets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_get_debug_info(
+/*=================*/
+ mutex_t* mutex, /*!< in: mutex */
+ const char** file_name, /*!< out: file where requested */
+ ulint* line, /*!< out: line where requested */
+ os_thread_id_t* thread_id); /*!< out: id of the thread which owns
+ the mutex */
+/******************************************************************//**
+Counts currently reserved mutexes. Works only in the debug version.
+@return number of reserved mutexes */
+UNIV_INTERN
+ulint
+mutex_n_reserved(void);
+/*==================*/
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+NOT to be used outside this module except in debugging! Gets the value
+of the lock word. */
+UNIV_INLINE
+lock_word_t
+mutex_get_lock_word(
+/*================*/
+ const mutex_t* mutex); /*!< in: mutex */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+NOT to be used outside this module except in debugging! Gets the waiters
+field in a mutex.
+@return value to set */
+UNIV_INLINE
+ulint
+mutex_get_waiters(
+/*==============*/
+ const mutex_t* mutex); /*!< in: mutex */
+#endif /* UNIV_SYNC_DEBUG */
+
+/*
+ LATCHING ORDER WITHIN THE DATABASE
+ ==================================
+
+The mutex or latch in the central memory object, for instance, a rollback
+segment object, must be acquired before acquiring the latch or latches to
+the corresponding file data structure. In the latching order below, these
+file page object latches are placed immediately below the corresponding
+central memory object latch or mutex.
+
+Synchronization object Notes
+---------------------- -----
+
+Dictionary mutex If we have a pointer to a dictionary
+| object, e.g., a table, it can be
+| accessed without reserving the
+| dictionary mutex. We must have a
+| reservation, a memoryfix, to the
+| appropriate table object in this case,
+| and the table must be explicitly
+| released later.
+V
+Dictionary header
+|
+V
+Secondary index tree latch The tree latch protects also all
+| the B-tree non-leaf pages. These
+V can be read with the page only
+Secondary index non-leaf bufferfixed to save CPU time,
+| no s-latch is needed on the page.
+| Modification of a page requires an
+| x-latch on the page, however. If a
+| thread owns an x-latch to the tree,
+| it is allowed to latch non-leaf pages
+| even after it has acquired the fsp
+| latch.
+V
+Secondary index leaf The latch on the secondary index leaf
+| can be kept while accessing the
+| clustered index, to save CPU time.
+V
+Clustered index tree latch To increase concurrency, the tree
+| latch is usually released when the
+| leaf page latch has been acquired.
+V
+Clustered index non-leaf
+|
+V
+Clustered index leaf
+|
+V
+Transaction system header
+|
+V
+Transaction undo mutex The undo log entry must be written
+| before any index page is modified.
+| Transaction undo mutex is for the undo
+| logs the analogue of the tree latch
+| for a B-tree. If a thread has the
+| trx undo mutex reserved, it is allowed
+| to latch the undo log pages in any
+| order, and also after it has acquired
+| the fsp latch.
+V
+Rollback segment mutex The rollback segment mutex must be
+| reserved, if, e.g., a new page must
+| be added to an undo log. The rollback
+| segment and the undo logs in its
+| history list can be seen as an
+| analogue of a B-tree, and the latches
+| reserved similarly, using a version of
+| lock-coupling. If an undo log must be
+| extended by a page when inserting an
+| undo log record, this corresponds to
+| a pessimistic insert in a B-tree.
+V
+Rollback segment header
+|
+V
+Purge system latch
+|
+V
+Undo log pages If a thread owns the trx undo mutex,
+| or for a log in the history list, the
+| rseg mutex, it is allowed to latch
+| undo log pages in any order, and even
+| after it has acquired the fsp latch.
+| If a thread does not have the
+| appropriate mutex, it is allowed to
+| latch only a single undo log page in
+| a mini-transaction.
+V
+File space management latch If a mini-transaction must allocate
+| several file pages, it can do that,
+| because it keeps the x-latch to the
+| file space management in its memo.
+V
+File system pages
+|
+V
+Kernel mutex If a kernel operation needs a file
+| page allocation, it must reserve the
+| fsp x-latch before acquiring the kernel
+| mutex.
+V
+Search system mutex
+|
+V
+Buffer pool mutex
+|
+V
+Log mutex
+|
+Any other latch
+|
+V
+Memory pool mutex */
+
+/* Latching order levels */
+
+/* User transaction locks are higher than any of the latch levels below:
+no latches are allowed when a thread goes to wait for a normal table
+or row lock! */
+#define SYNC_USER_TRX_LOCK 9999
+#define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress
+ latching order checking */
+#define SYNC_LEVEL_VARYING 2000 /* Level is varying. Only used with
+ buffer pool page locks, which do not
+ have a fixed level, but instead have
+ their level set after the page is
+ locked; see e.g.
+ ibuf_bitmap_get_map_page(). */
+#define SYNC_TRX_I_S_RWLOCK 1910 /* Used for
+ trx_i_s_cache_t::rw_lock */
+#define SYNC_TRX_I_S_LAST_READ 1900 /* Used for
+ trx_i_s_cache_t::last_read_mutex */
+#define SYNC_FILE_FORMAT_TAG 1200 /* Used to serialize access to the
+ file format tag */
+#define SYNC_DICT_OPERATION 1001 /* table create, drop, etc. reserve
+ this in X-mode, implicit or backround
+ operations purge, rollback, foreign
+ key checks reserve this in S-mode */
+#define SYNC_DICT 1000
+#define SYNC_DICT_AUTOINC_MUTEX 999
+#define SYNC_DICT_HEADER 995
+#define SYNC_IBUF_HEADER 914
+#define SYNC_IBUF_PESS_INSERT_MUTEX 912
+#define SYNC_IBUF_MUTEX 910 /* ibuf mutex is really below
+ SYNC_FSP_PAGE: we assign a value this
+ high only to make the program to pass
+ the debug checks */
+/*-------------------------------*/
+#define SYNC_INDEX_TREE 900
+#define SYNC_TREE_NODE_NEW 892
+#define SYNC_TREE_NODE_FROM_HASH 891
+#define SYNC_TREE_NODE 890
+#define SYNC_PURGE_SYS 810
+#define SYNC_PURGE_LATCH 800
+#define SYNC_TRX_UNDO 700
+#define SYNC_RSEG 600
+#define SYNC_RSEG_HEADER_NEW 591
+#define SYNC_RSEG_HEADER 590
+#define SYNC_TRX_UNDO_PAGE 570
+#define SYNC_EXTERN_STORAGE 500
+#define SYNC_FSP 400
+#define SYNC_FSP_PAGE 395
+/*------------------------------------- Insert buffer headers */
+/*------------------------------------- ibuf_mutex */
+/*------------------------------------- Insert buffer tree */
+#define SYNC_IBUF_BITMAP_MUTEX 351
+#define SYNC_IBUF_BITMAP 350
+/*------------------------------------- MySQL query cache mutex */
+/*------------------------------------- MySQL binlog mutex */
+/*-------------------------------*/
+#define SYNC_KERNEL 300
+#define SYNC_REC_LOCK 299
+#define SYNC_TRX_LOCK_HEAP 298
+#define SYNC_TRX_SYS_HEADER 290
+#define SYNC_LOG 170
+#define SYNC_RECV 168
+#define SYNC_WORK_QUEUE 162
+#define SYNC_SEARCH_SYS_CONF 161 /* for assigning btr_search_enabled */
+#define SYNC_SEARCH_SYS 160 /* NOTE that if we have a memory
+ heap that can be extended to the
+ buffer pool, its logical level is
+ SYNC_SEARCH_SYS, as memory allocation
+ can call routines there! Otherwise
+ the level is SYNC_MEM_HASH. */
+#define SYNC_BUF_POOL 150
+#define SYNC_BUF_BLOCK 149
+#define SYNC_DOUBLEWRITE 140
+#define SYNC_ANY_LATCH 135
+#define SYNC_THR_LOCAL 133
+#define SYNC_MEM_HASH 131
+#define SYNC_MEM_POOL 130
+
+/* Codes used to designate lock operations */
+#define RW_LOCK_NOT_LOCKED 350
+#define RW_LOCK_EX 351
+#define RW_LOCK_EXCLUSIVE 351
+#define RW_LOCK_SHARED 352
+#define RW_LOCK_WAIT_EX 353
+#define SYNC_MUTEX 354
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! The structure used in the spin lock
+implementation of a mutual exclusion semaphore. */
+
+/** InnoDB mutex */
+struct mutex_struct {
+ os_event_t event; /*!< Used by sync0arr.c for the wait queue */
+ volatile lock_word_t lock_word; /*!< lock_word is the target
+ of the atomic test-and-set instruction when
+ atomic operations are enabled. */
+
+#if !defined(HAVE_ATOMIC_BUILTINS)
+ os_fast_mutex_t
+ os_fast_mutex; /*!< We use this OS mutex in place of lock_word
+ when atomic operations are not enabled */
+#endif
+ ulint waiters; /*!< This ulint is set to 1 if there are (or
+ may be) threads waiting in the global wait
+ array for this mutex to be released.
+ Otherwise, this is 0. */
+ UT_LIST_NODE_T(mutex_t) list; /*!< All allocated mutexes are put into
+ a list. Pointers to the next and prev. */
+#ifdef UNIV_SYNC_DEBUG
+ const char* file_name; /*!< File where the mutex was locked */
+ ulint line; /*!< Line where the mutex was locked */
+ ulint level; /*!< Level in the global latching order */
+#endif /* UNIV_SYNC_DEBUG */
+ const char* cfile_name;/*!< File name where mutex created */
+ ulint cline; /*!< Line where created */
+#ifdef UNIV_DEBUG
+ os_thread_id_t thread_id; /*!< The thread id of the thread
+ which locked the mutex. */
+ ulint magic_n; /*!< MUTEX_MAGIC_N */
+/** Value of mutex_struct::magic_n */
+# define MUTEX_MAGIC_N (ulint)979585
+#endif /* UNIV_DEBUG */
+ ulong count_os_wait; /*!< count of os_wait */
+#ifdef UNIV_DEBUG
+ ulong count_using; /*!< count of times mutex used */
+ ulong count_spin_loop; /*!< count of spin loops */
+ ulong count_spin_rounds;/*!< count of spin rounds */
+ ulong count_os_yield; /*!< count of os_wait */
+ ulonglong lspent_time; /*!< mutex os_wait timer msec */
+ ulonglong lmax_spent_time;/*!< mutex os_wait timer msec */
+ const char* cmutex_name; /*!< mutex name */
+ ulint mutex_type; /*!< 0=usual mutex, 1=rw_lock mutex */
+#endif /* UNIV_DEBUG */
+};
+
+/** The global array of wait cells for implementation of the databases own
+mutexes and read-write locks. */
+extern sync_array_t* sync_primary_wait_array;/* Appears here for
+ debugging purposes only! */
+
+/** Constant determining how long spin wait is continued before suspending
+the thread. A value 600 rounds on a 1995 100 MHz Pentium seems to correspond
+to 20 microseconds. */
+
+#define SYNC_SPIN_ROUNDS srv_n_spin_wait_rounds
+
+/** The number of mutex_exit calls. Intended for performance monitoring. */
+extern ib_int64_t mutex_exit_count;
+
+#ifdef UNIV_SYNC_DEBUG
+/** Latching order checks start when this is set TRUE */
+extern ibool sync_order_checks_on;
+#endif /* UNIV_SYNC_DEBUG */
+
+/** This variable is set to TRUE when sync_init is called */
+extern ibool sync_initialized;
+
+/** Global list of database mutexes (not OS mutexes) created. */
+typedef UT_LIST_BASE_NODE_T(mutex_t) ut_list_base_node_t;
+/** Global list of database mutexes (not OS mutexes) created. */
+extern ut_list_base_node_t mutex_list;
+
+/** Mutex protecting the mutex_list variable */
+extern mutex_t mutex_list_mutex;
+
+
+#ifndef UNIV_NONINL
+#include "sync0sync.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic
new file mode 100644
index 00000000000..b05020b5660
--- /dev/null
+++ b/storage/innobase/include/sync0sync.ic
@@ -0,0 +1,222 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0sync.ic
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+/******************************************************************//**
+Sets the waiters field in a mutex. */
+UNIV_INTERN
+void
+mutex_set_waiters(
+/*==============*/
+ mutex_t* mutex, /*!< in: mutex */
+ ulint n); /*!< in: value to set */
+/******************************************************************//**
+Reserves a mutex for the current thread. If the mutex is reserved, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
+for the mutex before suspending the thread. */
+UNIV_INTERN
+void
+mutex_spin_wait(
+/*============*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where requested */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Sets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_set_debug_info(
+/*=================*/
+ mutex_t* mutex, /*!< in: mutex */
+ const char* file_name, /*!< in: file where requested */
+ ulint line); /*!< in: line where requested */
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+Releases the threads waiting in the primary wait array for this mutex. */
+UNIV_INTERN
+void
+mutex_signal_object(
+/*================*/
+ mutex_t* mutex); /*!< in: mutex */
+
+/******************************************************************//**
+Performs an atomic test-and-set instruction to the lock_word field of a
+mutex.
+@return the previous value of lock_word: 0 or 1 */
+UNIV_INLINE
+byte
+mutex_test_and_set(
+/*===============*/
+ mutex_t* mutex) /*!< in: mutex */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+ return(os_atomic_test_and_set_byte(&mutex->lock_word, 1));
+#else
+ ibool ret;
+
+ ret = os_fast_mutex_trylock(&(mutex->os_fast_mutex));
+
+ if (ret == 0) {
+ /* We check that os_fast_mutex_trylock does not leak
+ and allow race conditions */
+ ut_a(mutex->lock_word == 0);
+
+ mutex->lock_word = 1;
+ }
+
+ return((byte)ret);
+#endif
+}
+
+/******************************************************************//**
+Performs a reset instruction to the lock_word field of a mutex. This
+instruction also serializes memory operations to the program order. */
+UNIV_INLINE
+void
+mutex_reset_lock_word(
+/*==================*/
+ mutex_t* mutex) /*!< in: mutex */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+ /* In theory __sync_lock_release should be used to release the lock.
+ Unfortunately, it does not work properly alone. The workaround is
+ that more conservative __sync_lock_test_and_set is used instead. */
+ os_atomic_test_and_set_byte(&mutex->lock_word, 0);
+#else
+ mutex->lock_word = 0;
+
+ os_fast_mutex_unlock(&(mutex->os_fast_mutex));
+#endif
+}
+
+/******************************************************************//**
+Gets the value of the lock word. */
+UNIV_INLINE
+lock_word_t
+mutex_get_lock_word(
+/*================*/
+ const mutex_t* mutex) /*!< in: mutex */
+{
+ ut_ad(mutex);
+
+ return(mutex->lock_word);
+}
+
+/******************************************************************//**
+Gets the waiters field in a mutex.
+@return value to set */
+UNIV_INLINE
+ulint
+mutex_get_waiters(
+/*==============*/
+ const mutex_t* mutex) /*!< in: mutex */
+{
+ const volatile ulint* ptr; /*!< declared volatile to ensure that
+ the value is read from memory */
+ ut_ad(mutex);
+
+ ptr = &(mutex->waiters);
+
+ return(*ptr); /* Here we assume that the read of a single
+ word from memory is atomic */
+}
+
+/******************************************************************//**
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit(
+/*=======*/
+ mutex_t* mutex) /*!< in: pointer to mutex */
+{
+ ut_ad(mutex_own(mutex));
+
+ ut_d(mutex->thread_id = (os_thread_id_t) ULINT_UNDEFINED);
+
+#ifdef UNIV_SYNC_DEBUG
+ sync_thread_reset_level(mutex);
+#endif
+ mutex_reset_lock_word(mutex);
+
+ /* A problem: we assume that mutex_reset_lock word
+ is a memory barrier, that is when we read the waiters
+ field next, the read must be serialized in memory
+ after the reset. A speculative processor might
+ perform the read first, which could leave a waiting
+ thread hanging indefinitely.
+
+ Our current solution call every second
+ sync_arr_wake_threads_if_sema_free()
+ to wake up possible hanging threads if
+ they are missed in mutex_signal_object. */
+
+ if (mutex_get_waiters(mutex) != 0) {
+
+ mutex_signal_object(mutex);
+ }
+
+#ifdef UNIV_SYNC_PERF_STAT
+ mutex_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Locks a mutex for the current thread. If the mutex is reserved, the function
+spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for the mutex
+before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line) /*!< in: line where locked */
+{
+ ut_ad(mutex_validate(mutex));
+ ut_ad(!mutex_own(mutex));
+
+ /* Note that we do not peek at the value of lock_word before trying
+ the atomic test_and_set; we could peek, and possibly save time. */
+
+ ut_d(mutex->count_using++);
+
+ if (!mutex_test_and_set(mutex)) {
+ ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+#endif
+ return; /* Succeeded! */
+ }
+
+ mutex_spin_wait(mutex, file_name, line);
+}
diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h
new file mode 100644
index 00000000000..1911bbac7fd
--- /dev/null
+++ b/storage/innobase/include/sync0types.h
@@ -0,0 +1,34 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0types.h
+Global types for sync
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0types_h
+#define sync0types_h
+
+/** Rename mutex_t to avoid name space collision on some systems */
+#define mutex_t ib_mutex_t
+/** InnoDB mutex */
+typedef struct mutex_struct mutex_t;
+
+#endif
diff --git a/storage/innobase/include/thr0loc.h b/storage/innobase/include/thr0loc.h
new file mode 100644
index 00000000000..b4bdc33e615
--- /dev/null
+++ b/storage/innobase/include/thr0loc.h
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/thr0loc.h
+The thread local storage
+
+Created 10/5/1995 Heikki Tuuri
+*******************************************************/
+
+/* This module implements storage private to each thread,
+a capability useful in some situations like storing the
+OS handle to the current thread, or its priority. */
+
+#ifndef thr0loc_h
+#define thr0loc_h
+
+#include "univ.i"
+#include "os0thread.h"
+
+/****************************************************************//**
+Initializes the thread local storage module. */
+UNIV_INTERN
+void
+thr_local_init(void);
+/*================*/
+/*******************************************************************//**
+Creates a local storage struct for the calling new thread. */
+UNIV_INTERN
+void
+thr_local_create(void);
+/*==================*/
+/*******************************************************************//**
+Frees the local storage struct for the specified thread. */
+UNIV_INTERN
+void
+thr_local_free(
+/*===========*/
+ os_thread_id_t id); /*!< in: thread id */
+/*******************************************************************//**
+Gets the slot number in the thread table of a thread.
+@return slot number */
+UNIV_INTERN
+ulint
+thr_local_get_slot_no(
+/*==================*/
+ os_thread_id_t id); /*!< in: thread id of the thread */
+/*******************************************************************//**
+Sets in the local storage the slot number in the thread table of a thread. */
+UNIV_INTERN
+void
+thr_local_set_slot_no(
+/*==================*/
+ os_thread_id_t id, /*!< in: thread id of the thread */
+ ulint slot_no);/*!< in: slot number */
+/*******************************************************************//**
+Returns pointer to the 'in_ibuf' field within the current thread local
+storage.
+@return pointer to the in_ibuf field */
+UNIV_INTERN
+ibool*
+thr_local_get_in_ibuf_field(void);
+/*=============================*/
+
+#ifndef UNIV_NONINL
+#include "thr0loc.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/thr0loc.ic b/storage/innobase/include/thr0loc.ic
new file mode 100644
index 00000000000..ce44e512320
--- /dev/null
+++ b/storage/innobase/include/thr0loc.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/thr0loc.ic
+Thread local storage
+
+Created 10/4/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
new file mode 100644
index 00000000000..9bf032de9f9
--- /dev/null
+++ b/storage/innobase/include/trx0i_s.h
@@ -0,0 +1,240 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0i_s.h
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables cache structures and public
+functions.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef trx0i_s_h
+#define trx0i_s_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "ut0ut.h"
+
+/** The maximum amount of memory that can be consumed by innodb_trx,
+innodb_locks and innodb_lock_waits information schema tables. */
+#define TRX_I_S_MEM_LIMIT 16777216 /* 16 MiB */
+
+/** The maximum length of a string that can be stored in
+i_s_locks_row_t::lock_data */
+#define TRX_I_S_LOCK_DATA_MAX_LEN 8192
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_query */
+#define TRX_I_S_TRX_QUERY_MAX_LEN 1024
+
+/** A row of INFORMATION_SCHEMA.innodb_locks */
+typedef struct i_s_locks_row_struct i_s_locks_row_t;
+/** A row of INFORMATION_SCHEMA.innodb_trx */
+typedef struct i_s_trx_row_struct i_s_trx_row_t;
+/** A row of INFORMATION_SCHEMA.innodb_lock_waits */
+typedef struct i_s_lock_waits_row_struct i_s_lock_waits_row_t;
+
+/** Objects of trx_i_s_cache_t::locks_hash */
+typedef struct i_s_hash_chain_struct i_s_hash_chain_t;
+
+/** Objects of this type are added to the hash table
+trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_struct {
+ i_s_locks_row_t* value; /*!< row of
+ INFORMATION_SCHEMA.innodb_locks*/
+ i_s_hash_chain_t* next; /*!< next item in the hash chain */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_locks row */
+struct i_s_locks_row_struct {
+ ullint lock_trx_id; /*!< transaction identifier */
+ const char* lock_mode; /*!< lock mode from
+ lock_get_mode_str() */
+ const char* lock_type; /*!< lock type from
+ lock_get_type_str() */
+ const char* lock_table; /*!< table name from
+ lock_get_table_name() */
+ const char* lock_index; /*!< index name from
+ lock_rec_get_index_name() */
+ /** Information for record locks. All these are
+ ULINT_UNDEFINED for table locks. */
+ /* @{ */
+ ulint lock_space; /*!< tablespace identifier */
+ ulint lock_page; /*!< page number within the_space */
+ ulint lock_rec; /*!< heap number of the record
+ on the page */
+ const char* lock_data; /*!< (some) content of the record */
+ /* @} */
+
+ /** The following are auxiliary and not included in the table */
+ /* @{ */
+ ullint lock_table_id;
+ /*!< table identifier from
+ lock_get_table_id */
+ i_s_hash_chain_t hash_chain; /*!< hash table chain node for
+ trx_i_s_cache_t::locks_hash */
+ /* @} */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
+struct i_s_trx_row_struct {
+ ullint trx_id; /*!< transaction identifier */
+ const char* trx_state; /*!< transaction state from
+ trx_get_que_state_str() */
+ ib_time_t trx_started; /*!< trx_struct::start_time */
+ const i_s_locks_row_t* requested_lock_row;
+ /*!< pointer to a row
+ in innodb_locks if trx
+ is waiting, or NULL */
+ ib_time_t trx_wait_started;
+ /*!< trx_struct::wait_started */
+ ullint trx_weight; /*!< TRX_WEIGHT() */
+ ulint trx_mysql_thread_id;
+ /*!< thd_get_thread_id() */
+ const char* trx_query; /*!< MySQL statement being
+ executed in the transaction */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
+struct i_s_lock_waits_row_struct {
+ const i_s_locks_row_t* requested_lock_row; /*!< requested lock */
+ const i_s_locks_row_t* blocking_lock_row; /*!< blocking lock */
+};
+
+/** Cache of INFORMATION_SCHEMA table data */
+typedef struct trx_i_s_cache_struct trx_i_s_cache_t;
+
+/** Auxiliary enum used by functions that need to select one of the
+INFORMATION_SCHEMA tables */
+enum i_s_table {
+ I_S_INNODB_TRX, /*!< INFORMATION_SCHEMA.innodb_trx */
+ I_S_INNODB_LOCKS, /*!< INFORMATION_SCHEMA.innodb_locks */
+ I_S_INNODB_LOCK_WAITS /*!< INFORMATION_SCHEMA.innodb_lock_waits */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+extern trx_i_s_cache_t* trx_i_s_cache;
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache); /*!< out: cache to init */
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+UNIV_INTERN
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table); /*!< in: which table */
+
+/*******************************************************************//**
+Retrieves the nth row in the cache for a given INFORMATION SCHEMA
+table.
+@return row */
+UNIV_INTERN
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n); /*!< in: row number */
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+@return 0 - fetched, 1 - not */
+UNIV_INTERN
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache); /*!< in/out: cache */
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+UNIV_INTERN
+ibool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/** The maximum length of a resulting lock_id_size in
+trx_i_s_create_lock_id(), not including the terminating NUL.
+":%lu:%lu:%lu" -> 63 chars */
+#define TRX_I_S_LOCK_ID_MAX_LEN (TRX_ID_MAX_LEN + 63)
+
+/*******************************************************************//**
+Crafts a lock id string from a i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+UNIV_INTERN
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size);/*!< in: size of the lock id
+ buffer */
+
+#endif /* trx0i_s_h */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
new file mode 100644
index 00000000000..7812ad7eb92
--- /dev/null
+++ b/storage/innobase/include/trx0purge.h
@@ -0,0 +1,183 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.h
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0purge_h
+#define trx0purge_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "que0types.h"
+#include "page0page.h"
+#include "usr0sess.h"
+#include "fil0fil.h"
+
+/** The global data structure coordinating a purge */
+extern trx_purge_t* purge_sys;
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+extern trx_undo_rec_t trx_purge_dummy_rec;
+
+/********************************************************************//**
+Calculates the file address of an undo log header when we have the file
+address of its history list node.
+@return file address of the log */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+ fil_addr_t node_addr); /*!< in: file address of the history
+ list node of the log */
+/*****************************************************************//**
+Checks if trx_id is >= purge_view: then it is guaranteed that its update
+undo log still exists in the system.
+@return TRUE if is sure that it is preserved, also if the function
+returns FALSE, it is possible that the undo log still exists in the
+system */
+UNIV_INTERN
+ibool
+trx_purge_update_undo_must_exist(
+/*=============================*/
+ trx_id_t trx_id);/*!< in: transaction id */
+/********************************************************************//**
+Creates the global purge system control structure and inits the history
+mutex. */
+UNIV_INTERN
+void
+trx_purge_sys_create(void);
+/*======================*/
+/********************************************************************//**
+Adds the update undo log as the first log in the history list. Removes the
+update undo log segment from the rseg slot if it is too big for reuse. */
+UNIV_INTERN
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+ trx_t* trx, /*!< in: transaction */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge. It must be
+released with the corresponding release function.
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can skipped in purge; NULL if none left */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+ roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */
+ trx_undo_inf_t** cell, /*!< out: storage cell for the record in the
+ purge array */
+ mem_heap_t* heap); /*!< in: memory heap where copied */
+/*******************************************************************//**
+Releases a reserved purge undo record. */
+UNIV_INTERN
+void
+trx_purge_rec_release(
+/*==================*/
+ trx_undo_inf_t* cell); /*!< in: storage cell */
+/*******************************************************************//**
+This function runs a purge batch.
+@return number of undo log pages handled in the batch */
+UNIV_INTERN
+ulint
+trx_purge(void);
+/*===========*/
+/******************************************************************//**
+Prints information of the purge system to stderr. */
+UNIV_INTERN
+void
+trx_purge_sys_print(void);
+/*======================*/
+
+/** The control structure used in the purge operation */
+struct trx_purge_struct{
+ ulint state; /*!< Purge system state */
+ sess_t* sess; /*!< System session running the purge
+ query */
+ trx_t* trx; /*!< System transaction running the purge
+ query: this trx is not in the trx list
+ of the trx system and it never ends */
+ que_t* query; /*!< The query graph which will do the
+ parallelized purge operation */
+ rw_lock_t latch; /*!< The latch protecting the purge view.
+ A purge operation must acquire an
+ x-latch here for the instant at which
+ it changes the purge view: an undo
+ log operation can prevent this by
+ obtaining an s-latch here. */
+ read_view_t* view; /*!< The purge will not remove undo logs
+ which are >= this view (purge view) */
+ mutex_t mutex; /*!< Mutex protecting the fields below */
+ ulint n_pages_handled;/*!< Approximate number of undo log
+ pages processed in purge */
+ ulint handle_limit; /*!< Target of how many pages to get
+ processed in the current purge */
+ /*------------------------------*/
+ /* The following two fields form the 'purge pointer' which advances
+ during a purge, and which is used in history list truncation */
+
+ trx_id_t purge_trx_no; /*!< Purge has advanced past all
+ transactions whose number is less
+ than this */
+ undo_no_t purge_undo_no; /*!< Purge has advanced past all records
+ whose undo number is less than this */
+ /*-----------------------------*/
+ ibool next_stored; /*!< TRUE if the info of the next record
+ to purge is stored below: if yes, then
+ the transaction number and the undo
+ number of the record are stored in
+ purge_trx_no and purge_undo_no above */
+ trx_rseg_t* rseg; /*!< Rollback segment for the next undo
+ record to purge */
+ ulint page_no; /*!< Page number for the next undo
+ record to purge, page number of the
+ log header, if dummy record */
+ ulint offset; /*!< Page offset for the next undo
+ record to purge, 0 if the dummy
+ record */
+ ulint hdr_page_no; /*!< Header page of the undo log where
+ the next record to purge belongs */
+ ulint hdr_offset; /*!< Header byte offset on the page */
+ /*-----------------------------*/
+ trx_undo_arr_t* arr; /*!< Array of transaction numbers and
+ undo numbers of the undo records
+ currently under processing in purge */
+ mem_heap_t* heap; /*!< Temporary storage used during a
+ purge: can be emptied after purge
+ completes */
+};
+
+#define TRX_PURGE_ON 1 /* purge operation is running */
+#define TRX_STOP_PURGE 2 /* purge operation is stopped, or
+ it should be stopped */
+#ifndef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/trx0purge.ic b/storage/innobase/include/trx0purge.ic
new file mode 100644
index 00000000000..de09e393654
--- /dev/null
+++ b/storage/innobase/include/trx0purge.ic
@@ -0,0 +1,43 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.ic
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+/********************************************************************//**
+Calculates the file address of an undo log header when we have the file
+address of its history list node.
+@return file address of the log */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+ fil_addr_t node_addr) /*!< in: file address of the history
+ list node of the log */
+{
+ node_addr.boffset -= TRX_UNDO_HISTORY_NODE;
+
+ return(node_addr);
+}
+
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
new file mode 100644
index 00000000000..0ae82c33afe
--- /dev/null
+++ b/storage/innobase/include/trx0rec.h
@@ -0,0 +1,338 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.h
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rec_h
+#define trx0rec_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+#include "dict0types.h"
+#include "data0data.h"
+#include "rem0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "que0types.h"
+
+/***********************************************************************//**
+Copies the undo record to the heap.
+@return own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ mem_heap_t* heap); /*!< in: heap where copied */
+/**********************************************************************//**
+Reads the undo log record type.
+@return record type */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Reads from an undo log record the record compiler info.
+@return compiler info */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Returns TRUE if an undo log record contains an extern storage field.
+@return TRUE if extern */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Returns the start of the undo record data area.
+@return offset to the data area */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_offset(
+/*====================*/
+ undo_no_t undo_no) /*!< in: undo no read from node */
+ __attribute__((const));
+
+/**********************************************************************//**
+Returns the start of the undo record data area. */
+#define trx_undo_rec_get_ptr(undo_rec, undo_no) \
+ ((undo_rec) + trx_undo_rec_get_offset(undo_no))
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ ibool* updated_extern, /*!< out: TRUE if we updated an
+ externally stored fild */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ dulint* table_id); /*!< out: table id */
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** ref, /*!< out, own: row reference */
+ mem_heap_t* heap); /*!< in: memory heap from which the memory
+ needed is allocated */
+/*******************************************************************//**
+Skips a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index); /*!< in: clustered index */
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ ulint* info_bits); /*!< out: info bits state */
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ trx_id_t trx_id, /*!< in: transaction id from this undorecord */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ ulint info_bits,/*!< in: info bits from this undo record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd); /*!< out, own: update vector */
+/*******************************************************************//**
+Builds a partial row from an update undo log record. It contains the
+columns which occur as ordering in any index of the table.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** row, /*!< out, own: partial row */
+ ibool ignore_prefix, /*!< in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap); /*!< in: memory heap from which the memory
+ needed is allocated */
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_undo_report_row_operation(
+/*==========================*/
+ ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is
+ set, does nothing */
+ ulint op_type, /*!< in: TRX_UNDO_INSERT_OP or
+ TRX_UNDO_MODIFY_OP */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index, otherwise NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: case of an update or delete
+ marking, the record in the clustered
+ index, otherwise NULL */
+ roll_ptr_t* roll_ptr); /*!< out: rollback pointer to the
+ inserted undo log record,
+ ut_dulint_zero if BTR_NO_UNDO_LOG
+ flag was specified */
+/******************************************************************//**
+Copies an undo record to heap. This function can be called if we know that
+the undo log record exists.
+@return own: copy of the record */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+/*======================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ mem_heap_t* heap); /*!< in: memory heap where copied */
+/******************************************************************//**
+Copies an undo record to heap.
+
+NOTE: the caller must have latches on the clustered index page and
+purge_view.
+
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been
+truncated and we cannot fetch the old version */
+UNIV_INTERN
+ulint
+trx_undo_get_undo_rec(
+/*==================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ trx_id_t trx_id, /*!< in: id of the trx that generated
+ the roll pointer: it points to an
+ undo log of this transaction */
+ trx_undo_rec_t** undo_rec, /*!< out, own: copy of the record */
+ mem_heap_t* heap); /*!< in: memory heap where copied */
+/*******************************************************************//**
+Build a previous version of a clustered index record. This function checks
+that the caller has a latch on the index page of the clustered index record
+and an s-latch on the purge_view. This guarantees that the stack of versions
+is locked.
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is
+earlier than purge_view, which means that it may have been removed,
+DB_ERROR if corrupted record */
+UNIV_INTERN
+ulint
+trx_undo_prev_version_build(
+/*========================*/
+ const rec_t* index_rec,/*!< in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr,/*!< in: mtr which contains the latch to
+ index_rec page and purge_view */
+ const rec_t* rec, /*!< in: version of a clustered index record */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers);/*!< out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a redo log record of adding an undo log record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_add_undo_rec(
+/*========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page); /*!< in: page or NULL */
+/***********************************************************//**
+Parses a redo log record of erasing of an undo page end.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_erase_page_end(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+
+#ifndef UNIV_HOTBACKUP
+
+/* Types of an undo log record: these have to be smaller than 16, as the
+compilation info multiplied by 16 is ORed to this value in an undo log
+record */
+
+#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */
+#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked
+ record */
+#define TRX_UNDO_UPD_DEL_REC 13 /* update of a delete marked record to
+ a not delete marked record; also the
+ fields of the record can change */
+#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields
+ do not change */
+#define TRX_UNDO_CMPL_INFO_MULT 16 /* compilation info is multiplied by
+ this and ORed to the type above */
+#define TRX_UNDO_UPD_EXTERN 128 /* This bit can be ORed to type_cmpl
+ to denote that we updated external
+ storage fields: used by purge to
+ free the external storage */
+
+/* Operation type flags used in trx_undo_report_row_operation */
+#define TRX_UNDO_INSERT_OP 1
+#define TRX_UNDO_MODIFY_OP 2
+
+#ifndef UNIV_NONINL
+#include "trx0rec.ic"
+#endif
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif /* trx0rec_h */
diff --git a/storage/innobase/include/trx0rec.ic b/storage/innobase/include/trx0rec.ic
new file mode 100644
index 00000000000..037b5d4f6cf
--- /dev/null
+++ b/storage/innobase/include/trx0rec.ic
@@ -0,0 +1,112 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.ic
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Reads from an undo log record the record type.
+@return record type */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ return(mach_read_from_1(undo_rec + 2) & (TRX_UNDO_CMPL_INFO_MULT - 1));
+}
+
+/**********************************************************************//**
+Reads from an undo log record the record compiler info.
+@return compiler info */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ return(mach_read_from_1(undo_rec + 2) / TRX_UNDO_CMPL_INFO_MULT);
+}
+
+/**********************************************************************//**
+Returns TRUE if an undo log record contains an extern storage field.
+@return TRUE if extern */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ if (mach_read_from_1(undo_rec + 2) & TRX_UNDO_UPD_EXTERN) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ const byte* ptr;
+
+ ptr = undo_rec + 3;
+
+ return(mach_dulint_read_much_compressed(ptr));
+}
+
+/**********************************************************************//**
+Returns the start of the undo record data area.
+@return offset to the data area */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_offset(
+/*====================*/
+ undo_no_t undo_no) /*!< in: undo no read from node */
+{
+ return (3 + mach_dulint_get_much_compressed_size(undo_no));
+}
+
+/***********************************************************************//**
+Copies the undo record to the heap.
+@return own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ mem_heap_t* heap) /*!< in: heap where copied */
+{
+ ulint len;
+
+ len = mach_read_from_2(undo_rec)
+ - ut_align_offset(undo_rec, UNIV_PAGE_SIZE);
+ return(mem_heap_dup(heap, undo_rec, len));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h
new file mode 100644
index 00000000000..ddca9e9e4ef
--- /dev/null
+++ b/storage/innobase/include/trx0roll.h
@@ -0,0 +1,341 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.h
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0roll_h
+#define trx0roll_h
+
+#include "univ.i"
+#include "trx0trx.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+
+#define trx_roll_free_all_savepoints(s) trx_roll_savepoints_free((s), NULL)
+
+/*******************************************************************//**
+Determines if this transaction is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if trx is an incomplete transaction that is being rolled
+back in crash recovery */
+UNIV_INTERN
+ibool
+trx_is_recv(
+/*========*/
+ const trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return savepoint */
+UNIV_INTERN
+trx_savept_t
+trx_savept_take(
+/*============*/
+ trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Creates an undo number array. */
+UNIV_INTERN
+trx_undo_arr_t*
+trx_undo_arr_create(void);
+/*=====================*/
+/*******************************************************************//**
+Frees an undo number array. */
+UNIV_INTERN
+void
+trx_undo_arr_free(
+/*==============*/
+ trx_undo_arr_t* arr); /*!< in: undo number array */
+/*******************************************************************//**
+Returns pointer to nth element in an undo number array.
+@return pointer to the nth element */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+ trx_undo_arr_t* arr, /*!< in: undo number array */
+ ulint n); /*!< in: position */
+/***********************************************************************//**
+Tries truncate the undo logs. */
+UNIV_INTERN
+void
+trx_roll_try_truncate(
+/*==================*/
+ trx_t* trx); /*!< in/out: transaction */
+/********************************************************************//**
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record to the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release.
+@return undo log record copied to heap, NULL if none left, or if the
+undo number of the top record would be less than the limit */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t limit, /*!< in: least undo number we need */
+ roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */
+ mem_heap_t* heap); /*!< in: memory heap where copied */
+/********************************************************************//**
+Reserves an undo log record for a query thread to undo. This should be
+called if the query thread gets the undo log record not using the pop
+function above.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no);/*!< in: undo number of the record */
+/*******************************************************************//**
+Releases a reserved undo record. */
+UNIV_INTERN
+void
+trx_undo_rec_release(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no);/*!< in: undo number */
+/*********************************************************************//**
+Starts a rollback operation. */
+UNIV_INTERN
+void
+trx_rollback(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ trx_sig_t* sig, /*!< in: signal starting the rollback */
+ que_thr_t** next_thr);/*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+trx_rollback_or_clean_all_recovered(
+/*================================*/
+ void* arg __attribute__((unused)));
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+/****************************************************************//**
+Finishes a transaction rollback. */
+UNIV_INTERN
+void
+trx_finish_rollback_off_kernel(
+/*===========================*/
+ que_t* graph, /*!< in: undo graph which can now be freed */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t** next_thr);/*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if this parameter is
+ NULL, it is ignored */
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph.
+@return own: the query graph */
+UNIV_INTERN
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ trx_t* trx); /*!< in: trx handle */
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_for_mysql(
+/*===================*/
+ trx_t* trx); /*!< in: transaction handle */
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx); /*!< in: transaction handle */
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_general_rollback_for_mysql(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction handle */
+ ibool partial,/*!< in: TRUE if partial rollback requested */
+ trx_savept_t* savept);/*!< in: pointer to savepoint undo number, if
+ partial rollback requested */
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t* mysql_binlog_cache_pos);/*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t binlog_cache_pos); /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+
+/*******************************************************************//**
+Releases a named savepoint. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name); /*!< in: savepoint name */
+
+/*******************************************************************//**
+Frees a single savepoint struct. */
+UNIV_INTERN
+void
+trx_roll_savepoint_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep); /*!< in: savepoint to free */
+
+/*******************************************************************//**
+Frees savepoint structs starting from savep, if savep == NULL then
+free all savepoints. */
+UNIV_INTERN
+void
+trx_roll_savepoints_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep); /*!< in: free all savepoints > this one;
+ if this is NULL, free all savepoints
+ of trx */
+
+/** A cell of trx_undo_arr_struct; used during a rollback and a purge */
+struct trx_undo_inf_struct{
+ trx_id_t trx_no; /*!< transaction number: not defined during
+ a rollback */
+ undo_no_t undo_no;/*!< undo number of an undo record */
+ ibool in_use; /*!< TRUE if the cell is in use */
+};
+
+/** During a rollback and a purge, undo numbers of undo records currently being
+processed are stored in this array */
+
+struct trx_undo_arr_struct{
+ ulint n_cells; /*!< number of cells in the array */
+ ulint n_used; /*!< number of cells currently in use */
+ trx_undo_inf_t* infos; /*!< the array of undo infos */
+ mem_heap_t* heap; /*!< memory heap from which allocated */
+};
+
+/** Rollback node states */
+enum roll_node_state {
+ ROLL_NODE_SEND = 1, /*!< about to send a rollback signal to
+ the transaction */
+ ROLL_NODE_WAIT /*!< rollback signal sent to the transaction,
+ waiting for completion */
+};
+
+/** Rollback command node in a query graph */
+struct roll_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_ROLLBACK */
+ enum roll_node_state state; /*!< node execution state */
+ ibool partial;/*!< TRUE if we want a partial
+ rollback */
+ trx_savept_t savept; /*!< savepoint to which to
+ roll back, in the case of a
+ partial rollback */
+};
+
+/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */
+struct trx_named_savept_struct{
+ char* name; /*!< savepoint name */
+ trx_savept_t savept; /*!< the undo number corresponding to
+ the savepoint */
+ ib_int64_t mysql_binlog_cache_pos;
+ /*!< the MySQL binlog cache position
+ corresponding to this savepoint, not
+ defined if the MySQL binlogging is not
+ enabled */
+ UT_LIST_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< the list of savepoints of a
+ transaction */
+};
+
+#ifndef UNIV_NONINL
+#include "trx0roll.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/trx0roll.ic b/storage/innobase/include/trx0roll.ic
new file mode 100644
index 00000000000..3460832b18c
--- /dev/null
+++ b/storage/innobase/include/trx0roll.ic
@@ -0,0 +1,40 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.ic
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/*******************************************************************//**
+Returns pointer to nth element in an undo number array.
+@return pointer to the nth element */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+ trx_undo_arr_t* arr, /*!< in: undo number array */
+ ulint n) /*!< in: position */
+{
+ ut_ad(arr);
+ ut_ad(n < arr->n_cells);
+
+ return(arr->infos + n);
+}
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
new file mode 100644
index 00000000000..dbc732651ca
--- /dev/null
+++ b/storage/innobase/include/trx0rseg.h
@@ -0,0 +1,213 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.h
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rseg_h
+#define trx0rseg_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "trx0sys.h"
+
+/******************************************************************//**
+Gets a rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get(
+/*==========*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Gets a newly created rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get_new(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Gets the file page number of the nth undo log slot.
+@return page number of the undo log segment */
+UNIV_INLINE
+ulint
+trx_rsegf_get_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Sets the file page number of the nth undo log slot. */
+UNIV_INLINE
+void
+trx_rsegf_set_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ ulint page_no,/*!< in: page number of the undo log segment */
+ mtr_t* mtr); /*!< in: mtr */
+/****************************************************************//**
+Looks for a free slot for an undo log segment.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Looks for a rollback segment, based on the rollback segment id.
+@return rollback segment */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+ ulint id); /*!< in: rollback segment id */
+/****************************************************************//**
+Creates a rollback segment header. This function is called only when
+a new rollback segment is created in the database.
+@return page number of the created segment, FIL_NULL if fail */
+UNIV_INTERN
+ulint
+trx_rseg_header_create(
+/*===================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint max_size, /*!< in: max size in pages */
+ ulint* slot_no, /*!< out: rseg id == slot number in trx sys */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************************//**
+Creates the memory copies for rollback segments and initializes the
+rseg list and array in trx_sys at a database startup. */
+UNIV_INTERN
+void
+trx_rseg_list_and_array_init(
+/*=========================*/
+ trx_sysf_t* sys_header, /*!< in: trx system header */
+ mtr_t* mtr); /*!< in: mtr */
+/****************************************************************//**
+Creates a new rollback segment to the database.
+@return the created segment object, NULL if fail */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_create(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint max_size, /*!< in: max size in pages */
+ ulint* id, /*!< out: rseg id */
+ mtr_t* mtr); /*!< in: mtr */
+
+
+/* Number of undo log slots in a rollback segment file copy */
+#define TRX_RSEG_N_SLOTS (UNIV_PAGE_SIZE / 16)
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2)
+
+/* The rollback segment memory object */
+struct trx_rseg_struct{
+ /*--------------------------------------------------------*/
+ ulint id; /*!< rollback segment id == the index of
+ its slot in the trx system file copy */
+ mutex_t mutex; /*!< mutex protecting the fields in this
+ struct except id; NOTE that the latching
+ order must always be kernel mutex ->
+ rseg mutex */
+ ulint space; /*!< space where the rollback segment is
+ header is placed */
+ ulint zip_size;/* compressed page size of space
+ in bytes, or 0 for uncompressed spaces */
+ ulint page_no;/* page number of the rollback segment
+ header */
+ ulint max_size;/* maximum allowed size in pages */
+ ulint curr_size;/* current size in pages */
+ /*--------------------------------------------------------*/
+ /* Fields for update undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_list;
+ /* List of update undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_cached;
+ /* List of update undo log segments
+ cached for fast reuse */
+ /*--------------------------------------------------------*/
+ /* Fields for insert undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_list;
+ /* List of insert undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_cached;
+ /* List of insert undo log segments
+ cached for fast reuse */
+ /*--------------------------------------------------------*/
+ ulint last_page_no; /*!< Page number of the last not yet
+ purged log header in the history list;
+ FIL_NULL if all list purged */
+ ulint last_offset; /*!< Byte offset of the last not yet
+ purged log header */
+ trx_id_t last_trx_no; /*!< Transaction number of the last not
+ yet purged log */
+ ibool last_del_marks; /*!< TRUE if the last not yet purged log
+ needs purging */
+ /*--------------------------------------------------------*/
+ UT_LIST_NODE_T(trx_rseg_t) rseg_list;
+ /* the list of the rollback segment
+ memory objects */
+};
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of
+ an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size */
+#define TRX_RSEG_SLOT_SIZE 4
+
+/* The offset of the rollback segment header on its page */
+#define TRX_RSEG FSEG_PAGE_DATA
+
+/* Transaction rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_MAX_SIZE 0 /* Maximum allowed size for rollback
+ segment in pages */
+#define TRX_RSEG_HISTORY_SIZE 4 /* Number of file pages occupied
+ by the logs in the history list */
+#define TRX_RSEG_HISTORY 8 /* The update undo logs for committed
+ transactions */
+#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE)
+ /* Header for the file segment where
+ this page is placed */
+#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE)
+ /* Undo log segment slots */
+/*-------------------------------------------------------------*/
+
+#ifndef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/trx0rseg.ic b/storage/innobase/include/trx0rseg.ic
new file mode 100644
index 00000000000..daffa92fc7d
--- /dev/null
+++ b/storage/innobase/include/trx0rseg.ic
@@ -0,0 +1,145 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.ic
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "srv0srv.h"
+#include "mtr0log.h"
+
+/******************************************************************//**
+Gets a rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get(
+/*==========*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ trx_rsegf_t* header;
+
+ block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER);
+
+ header = TRX_RSEG + buf_block_get_frame(block);
+
+ return(header);
+}
+
+/******************************************************************//**
+Gets a newly created rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get_new(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ trx_rsegf_t* header;
+
+ block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+
+ header = TRX_RSEG + buf_block_get_frame(block);
+
+ return(header);
+}
+
+/***************************************************************//**
+Gets the file page number of the nth undo log slot.
+@return page number of the undo log segment */
+UNIV_INLINE
+ulint
+trx_rsegf_get_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to get slot %lu of rseg\n",
+ (ulong) n);
+ ut_error;
+ }
+
+ return(mtr_read_ulint(rsegf + TRX_RSEG_UNDO_SLOTS
+ + n * TRX_RSEG_SLOT_SIZE, MLOG_4BYTES, mtr));
+}
+
+/***************************************************************//**
+Sets the file page number of the nth undo log slot. */
+UNIV_INLINE
+void
+trx_rsegf_set_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ ulint page_no,/*!< in: page number of the undo log segment */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to set slot %lu of rseg\n",
+ (ulong) n);
+ ut_error;
+ }
+
+ mlog_write_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE,
+ page_no, MLOG_4BYTES, mtr);
+}
+
+/****************************************************************//**
+Looks for a free slot for an undo log segment.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+ ulint page_no;
+
+ for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+
+ page_no = trx_rsegf_get_nth_undo(rsegf, i, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
new file mode 100644
index 00000000000..812e8cfa0ba
--- /dev/null
+++ b/storage/innobase/include/trx0sys.h
@@ -0,0 +1,618 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.h
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0sys_h
+#define trx0sys_h
+
+#include "univ.i"
+
+#include "trx0types.h"
+#include "fsp0types.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0mtr.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "sync0sync.h"
+#include "ut0lst.h"
+#include "read0types.h"
+#include "page0types.h"
+
+/** In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. */
+/* @{ */
+/** Master binlog file name */
+extern char trx_sys_mysql_master_log_name[];
+/** Master binlog file position. We have successfully got the updates
+up to this position. -1 means that no crash recovery was needed, or
+there was no master log position info inside InnoDB.*/
+extern ib_int64_t trx_sys_mysql_master_log_pos;
+/* @} */
+
+/** If this MySQL server uses binary logging, after InnoDB has been inited
+and if it has done a crash recovery, we store the binlog file name and position
+here. */
+/* @{ */
+/** Binlog file name */
+extern char trx_sys_mysql_bin_log_name[];
+/** Binlog file position, or -1 if unknown */
+extern ib_int64_t trx_sys_mysql_bin_log_pos;
+/* @} */
+
+/** The transaction system */
+extern trx_sys_t* trx_sys;
+
+/** Doublewrite system */
+extern trx_doublewrite_t* trx_doublewrite;
+/** The following is set to TRUE when we are upgrading from pre-4.1
+format data files to the multiple tablespaces format data files */
+extern ibool trx_doublewrite_must_reset_space_ids;
+/** Set to TRUE when the doublewrite buffer is being created */
+extern ibool trx_doublewrite_buf_is_being_created;
+/** The following is TRUE when we are using the database in the
+post-4.1 format, i.e., we have successfully upgraded, or have created
+a new database installation */
+extern ibool trx_sys_multiple_tablespace_format;
+
+/****************************************************************//**
+Creates the doublewrite buffer to a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+trx_sys_create_doublewrite_buf(void);
+/*================================*/
+/****************************************************************//**
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+trx_sys_doublewrite_init_or_restore_pages(
+/*======================================*/
+ ibool restore_corrupt_pages); /*!< in: TRUE=restore pages */
+/****************************************************************//**
+Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
+multiple tablespace format. */
+UNIV_INTERN
+void
+trx_sys_mark_upgraded_to_multiple_tablespaces(void);
+/*===============================================*/
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+trx_doublewrite_page_inside(
+/*========================*/
+ ulint page_no); /*!< in: page number */
+/***************************************************************//**
+Checks if a page address is the trx sys header page.
+@return TRUE if trx sys header page */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+ ulint space, /*!< in: space */
+ ulint page_no);/*!< in: page number */
+/*****************************************************************//**
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started. */
+UNIV_INTERN
+void
+trx_sys_init_at_db_start(void);
+/*==========================*/
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create(void);
+/*================*/
+/****************************************************************//**
+Looks for a free slot for a rollback segment in the trx system file copy.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Gets the pointer in the nth slot of the rseg array.
+@return pointer to rseg object, NULL if slot not in use */
+UNIV_INLINE
+trx_rseg_t*
+trx_sys_get_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /*!< in: trx system */
+ ulint n); /*!< in: index of slot */
+/***************************************************************//**
+Sets the pointer in the nth slot of the rseg array. */
+UNIV_INLINE
+void
+trx_sys_set_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /*!< in: trx system */
+ ulint n, /*!< in: index of slot */
+ trx_rseg_t* rseg); /*!< in: pointer to rseg object, NULL if slot
+ not in use */
+/**********************************************************************//**
+Gets a pointer to the transaction system file copy and x-locks its page.
+@return pointer to system file copy, page x-locked */
+UNIV_INLINE
+trx_sysf_t*
+trx_sysf_get(
+/*=========*/
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Gets the space of the nth rollback segment slot in the trx system
+file copy.
+@return space id */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Gets the page number of the nth rollback segment slot in the trx system
+file copy.
+@return page number, FIL_NULL if slot unused */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Sets the space id of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint space, /*!< in: space id */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Sets the page number of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint page_no, /*!< in: page number, FIL_NULL if
+ the slot is reset to unused */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Allocates a new transaction id.
+@return new, allocated trx id */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_id(void);
+/*========================*/
+/*****************************************************************//**
+Allocates a new transaction number.
+@return new, allocated trx number */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_no(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Writes a trx id to an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_trx_id(
+/*=============*/
+ byte* ptr, /*!< in: pointer to memory where written */
+ trx_id_t id); /*!< in: id */
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Reads a trx id from an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_read_...
+@return id */
+UNIV_INLINE
+trx_id_t
+trx_read_trx_id(
+/*============*/
+ const byte* ptr); /*!< in: pointer to memory from where to read */
+/****************************************************************//**
+Looks for the trx handle with the given id in trx_list.
+@return the trx handle or NULL if not found */
+UNIV_INLINE
+trx_t*
+trx_get_on_id(
+/*==========*/
+ trx_id_t trx_id);/*!< in: trx id to search for */
+/****************************************************************//**
+Returns the minumum trx id in trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->conc_state to
+find out if the minimum trx id transaction itself is active, or already
+committed.)
+@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_list_get_min_trx_id(void);
+/*=========================*/
+/****************************************************************//**
+Checks if a transaction with the given id is active.
+@return TRUE if active */
+UNIV_INLINE
+ibool
+trx_is_active(
+/*==========*/
+ trx_id_t trx_id);/*!< in: trx id of the transaction */
+/****************************************************************//**
+Checks that trx is in the trx list.
+@return TRUE if is in */
+UNIV_INTERN
+ibool
+trx_in_trx_list(
+/*============*/
+ trx_t* in_trx);/*!< in: trx */
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave updates the latest master binlog position up to which
+replication has proceeded. */
+UNIV_INTERN
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+ const char* file_name,/*!< in: MySQL log file name */
+ ib_int64_t offset, /*!< in: position in that log file */
+ ulint field, /*!< in: offset of the MySQL log info field in
+ the trx sys header */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset(void);
+/*===================================*/
+/*****************************************************************//**
+Prints to stderr the MySQL master log offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_master_log_pos(void);
+/*====================================*/
+/*****************************************************************//**
+Initializes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_init(void);
+/*==========================*/
+/*****************************************************************//**
+Closes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_close(void);
+/*===========================*/
+/********************************************************************//**
+Tags the system table space with minimum format id if it has not been
+tagged yet.
+WARNING: This function is only called during the startup and AFTER the
+redo log application during recovery has finished. */
+UNIV_INTERN
+void
+trx_sys_file_format_tag_init(void);
+/*==============================*/
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id); /*!< in: id of the file format */
+/*****************************************************************//**
+Set the file format id unconditionally except if it's already the
+same value.
+@return TRUE if value updated */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_set(
+/*========================*/
+ ulint format_id, /*!< in: file format id */
+ const char** name); /*!< out: max file format name or
+ NULL if not needed. */
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void);
+/*=============================*/
+/*****************************************************************//**
+Check for the max file format tag stored on disk.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_sys_file_format_max_check(
+/*==========================*/
+ ulint max_format_id); /*!< in: the max format id to check */
+/********************************************************************//**
+Update the file format tag in the system tablespace only if the given
+format id is greater than the known max id.
+@return TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+ const char** name, /*!< out: max file format name */
+ ulint format_id); /*!< in: file format identifier */
+#else /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog info in the system header if the
+magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+ const byte* page); /*!< in: buffer containing the trx
+ system header page, i.e., page number
+ TRX_SYS_PAGE_NO in the tablespace */
+/*****************************************************************//**
+Reads the file format id from the first system table space file.
+Even if the call succeeds and returns TRUE, the returned format id
+may be ULINT_UNDEFINED signalling that the format id was not present
+in the data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_file_format_id(
+/*========================*/
+ const char *pathname, /*!< in: pathname of the first system
+ table space file */
+ ulint *format_id); /*!< out: file format of the system table
+ space */
+/*****************************************************************//**
+Reads the file format id from the given per-table data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_pertable_file_format_id(
+/*=================================*/
+ const char *pathname, /*!< in: pathname of a per-table
+ datafile */
+ ulint *format_id); /*!< out: file format of the per-table
+ data file */
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id); /*!< in: id of the file format */
+
+#endif /* !UNIV_HOTBACKUP */
+/* The automatically created system rollback segment has this id */
+#define TRX_SYS_SYSTEM_RSEG_ID 0
+
+/* Space id and page no where the trx system file copy resides */
+#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
+#include "fsp0fsp.h"
+#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
+
+/* The offset of the transaction system header on the page */
+#define TRX_SYS FSEG_PAGE_DATA
+
+/** Transaction system header */
+/*------------------------------------------------------------- @{ */
+#define TRX_SYS_TRX_ID_STORE 0 /*!< the maximum trx id or trx
+ number modulo
+ TRX_SYS_TRX_ID_UPDATE_MARGIN
+ written to a file page by any
+ transaction; the assignment of
+ transaction ids continues from
+ this number rounded up by
+ TRX_SYS_TRX_ID_UPDATE_MARGIN
+ plus
+ TRX_SYS_TRX_ID_UPDATE_MARGIN
+ when the database is
+ started */
+#define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the
+ tablespace segment the trx
+ system is created into */
+#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE)
+ /*!< the start of the array of
+ rollback segment specification
+ slots */
+/*------------------------------------------------------------- @} */
+
+/** Maximum number of rollback segments: the number of segment
+specification slots in the transaction system array; rollback segment
+id must fit in one byte, therefore 256; each slot is currently 8 bytes
+in size */
+#define TRX_SYS_N_RSEGS 256
+
+/** Maximum length of MySQL binlog file name, in bytes.
+@see trx_sys_mysql_master_log_name
+@see trx_sys_mysql_bin_log_name */
+#define TRX_SYS_MYSQL_LOG_NAME_LEN 512
+/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
+#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344
+
+#if UNIV_PAGE_SIZE < 4096
+# error "UNIV_PAGE_SIZE < 4096"
+#endif
+/** The offset of the MySQL replication info in the trx system header;
+this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
+#define TRX_SYS_MYSQL_MASTER_LOG_INFO (UNIV_PAGE_SIZE - 2000)
+
+/** The offset of the MySQL binlog offset info in the trx system header */
+#define TRX_SYS_MYSQL_LOG_INFO (UNIV_PAGE_SIZE - 1000)
+#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is
+ TRX_SYS_MYSQL_LOG_MAGIC_N
+ if we have valid data in the
+ MySQL binlog info */
+#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH 4 /*!< high 4 bytes of the offset
+ within that file */
+#define TRX_SYS_MYSQL_LOG_OFFSET_LOW 8 /*!< low 4 bytes of the offset
+ within that file */
+#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */
+
+#ifndef UNIV_HOTBACKUP
+/** Doublewrite buffer */
+/* @{ */
+/** The offset of the doublewrite buffer header on the trx system header page */
+#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200)
+/*-------------------------------------------------------------*/
+#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg
+ containing the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE
+ /*!< 4-byte magic number which
+ shows if we already have
+ created the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
+ /*!< page number of the
+ first page in the first
+ sequence of 64
+ (= FSP_EXTENT_SIZE) consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
+ /*!< page number of the
+ first page in the second
+ sequence of 64 consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /*!< we repeat
+ TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_BLOCK1,
+ TRX_SYS_DOUBLEWRITE_BLOCK2
+ so that if the trx sys
+ header is half-written
+ to disk, we still may
+ be able to recover the
+ information */
+/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+we must reset the doublewrite buffer, because starting from 4.1.x the
+space id of a data page is stored into
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO. */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
+
+/*-------------------------------------------------------------*/
+/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
+#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855
+/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386
+
+/** Size of the doublewrite block in pages */
+#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE
+/* @} */
+
+/** File format tag */
+/* @{ */
+/** The offset of the file format tag on the trx system header page
+(TRX_SYS_PAGE_NO of TRX_SYS_SPACE) */
+#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16)
+
+/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid. The file format
+identifier is added to this constant. */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL
+/** Contents of TRX_SYS_FILE_FORMAT_TAG+4 when valid */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL
+/* @} */
+
+/** Doublewrite control struct */
+struct trx_doublewrite_struct{
+ mutex_t mutex; /*!< mutex protecting the first_free field and
+ write_buf */
+ ulint block1; /*!< the page number of the first
+ doublewrite block (64 pages) */
+ ulint block2; /*!< page number of the second block */
+ ulint first_free; /*!< first free position in write_buf measured
+ in units of UNIV_PAGE_SIZE */
+ byte* write_buf; /*!< write buffer used in writing to the
+ doublewrite buffer, aligned to an
+ address divisible by UNIV_PAGE_SIZE
+ (which is required by Windows aio) */
+ byte* write_buf_unaligned;
+ /*!< pointer to write_buf, but unaligned */
+ buf_page_t**
+ buf_block_arr; /*!< array to store pointers to the buffer
+ blocks which have been cached to write_buf */
+};
+
+/** The transaction system central memory data structure; protected by the
+kernel mutex */
+struct trx_sys_struct{
+ trx_id_t max_trx_id; /*!< The smallest number not yet
+ assigned as a transaction id or
+ transaction number */
+ UT_LIST_BASE_NODE_T(trx_t) trx_list;
+ /*!< List of active and committed in
+ memory transactions, sorted on trx id,
+ biggest first */
+ UT_LIST_BASE_NODE_T(trx_t) mysql_trx_list;
+ /*!< List of transactions created
+ for MySQL */
+ UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list;
+ /*!< List of rollback segment
+ objects */
+ trx_rseg_t* latest_rseg; /*!< Latest rollback segment in the
+ round-robin assignment of rollback
+ segments to transactions */
+ trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS];
+ /*!< Pointer array to rollback
+ segments; NULL if slot not in use */
+ ulint rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY
+ list (update undo logs for committed
+ transactions), protected by
+ rseg->mutex */
+ UT_LIST_BASE_NODE_T(read_view_t) view_list;
+ /*!< List of read views sorted
+ on trx no, biggest first */
+};
+
+/** When a trx id which is zero modulo this number (which must be a power of
+two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system
+page is updated */
+#define TRX_SYS_TRX_ID_WRITE_MARGIN 256
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic
new file mode 100644
index 00000000000..1c7c732751b
--- /dev/null
+++ b/storage/innobase/include/trx0sys.ic
@@ -0,0 +1,387 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.ic
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+#include "data0type.h"
+#ifndef UNIV_HOTBACKUP
+# include "srv0srv.h"
+# include "mtr0log.h"
+
+/* The typedef for rseg slot in the file copy */
+typedef byte trx_sysf_rseg_t;
+
+/* Rollback segment specification slot offsets */
+/*-------------------------------------------------------------*/
+#define TRX_SYS_RSEG_SPACE 0 /* space where the the segment
+ header is placed; starting with
+ MySQL/InnoDB 5.1.7, this is
+ UNIV_UNDEFINED if the slot is unused */
+#define TRX_SYS_RSEG_PAGE_NO 4 /* page number where the the segment
+ header is placed; this is FIL_NULL
+ if the slot is unused */
+/*-------------------------------------------------------------*/
+/* Size of a rollback segment specification slot */
+#define TRX_SYS_RSEG_SLOT_SIZE 8
+
+/*****************************************************************//**
+Writes the value of max_trx_id to the file based trx system header. */
+UNIV_INTERN
+void
+trx_sys_flush_max_trx_id(void);
+/*==========================*/
+
+/***************************************************************//**
+Checks if a page address is the trx sys header page.
+@return TRUE if trx sys header page */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ if ((space == TRX_SYS_SPACE) && (page_no == TRX_SYS_PAGE_NO)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************//**
+Gets the pointer in the nth slot of the rseg array.
+@return pointer to rseg object, NULL if slot not in use */
+UNIV_INLINE
+trx_rseg_t*
+trx_sys_get_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /*!< in: trx system */
+ ulint n) /*!< in: index of slot */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(n < TRX_SYS_N_RSEGS);
+
+ return(sys->rseg_array[n]);
+}
+
+/***************************************************************//**
+Sets the pointer in the nth slot of the rseg array. */
+UNIV_INLINE
+void
+trx_sys_set_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /*!< in: trx system */
+ ulint n, /*!< in: index of slot */
+ trx_rseg_t* rseg) /*!< in: pointer to rseg object, NULL if slot
+ not in use */
+{
+ ut_ad(n < TRX_SYS_N_RSEGS);
+
+ sys->rseg_array[n] = rseg;
+}
+
+/**********************************************************************//**
+Gets a pointer to the transaction system header and x-latches its page.
+@return pointer to system header, page x-latched. */
+UNIV_INLINE
+trx_sysf_t*
+trx_sysf_get(
+/*=========*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ trx_sysf_t* header;
+
+ ut_ad(mtr);
+
+ block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ header = TRX_SYS + buf_block_get_frame(block);
+
+ return(header);
+}
+
+/*****************************************************************//**
+Gets the space of the nth rollback segment slot in the trx system
+file copy.
+@return space id */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys header */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE, MLOG_4BYTES, mtr));
+}
+
+/*****************************************************************//**
+Gets the page number of the nth rollback segment slot in the trx system
+header.
+@return page number, FIL_NULL if slot unused */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx system header */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(sys_header);
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_PAGE_NO, MLOG_4BYTES, mtr));
+}
+
+/*****************************************************************//**
+Sets the space id of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint space, /*!< in: space id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ mlog_write_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE,
+ space,
+ MLOG_4BYTES, mtr);
+}
+
+/*****************************************************************//**
+Sets the page number of the nth rollback segment slot in the trx system
+header. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys header */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint page_no, /*!< in: page number, FIL_NULL if the
+ slot is reset to unused */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ mlog_write_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_PAGE_NO,
+ page_no,
+ MLOG_4BYTES, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+Writes a trx id to an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_trx_id(
+/*=============*/
+ byte* ptr, /*!< in: pointer to memory where written */
+ trx_id_t id) /*!< in: id */
+{
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif
+ mach_write_to_6(ptr, id);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Reads a trx id from an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_read_...
+@return id */
+UNIV_INLINE
+trx_id_t
+trx_read_trx_id(
+/*============*/
+ const byte* ptr) /*!< in: pointer to memory from where to read */
+{
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif
+ return(mach_read_from_6(ptr));
+}
+
+/****************************************************************//**
+Looks for the trx handle with the given id in trx_list.
+@return the trx handle or NULL if not found */
+UNIV_INLINE
+trx_t*
+trx_get_on_id(
+/*==========*/
+ trx_id_t trx_id) /*!< in: trx id to search for */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx != NULL) {
+ if (0 == ut_dulint_cmp(trx_id, trx->id)) {
+
+ return(trx);
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ return(NULL);
+}
+
+/****************************************************************//**
+Returns the minumum trx id in trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->conc_state to
+find out if the minimum trx id transaction itself is active, or already
+committed.)
+@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_list_get_min_trx_id(void)
+/*=========================*/
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ trx = UT_LIST_GET_LAST(trx_sys->trx_list);
+
+ if (trx == NULL) {
+
+ return(trx_sys->max_trx_id);
+ }
+
+ return(trx->id);
+}
+
+/****************************************************************//**
+Checks if a transaction with the given id is active.
+@return TRUE if active */
+UNIV_INLINE
+ibool
+trx_is_active(
+/*==========*/
+ trx_id_t trx_id) /*!< in: trx id of the transaction */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ if (ut_dulint_cmp(trx_id, trx_list_get_min_trx_id()) < 0) {
+
+ return(FALSE);
+ }
+
+ if (ut_dulint_cmp(trx_id, trx_sys->max_trx_id) >= 0) {
+
+ /* There must be corruption: we return TRUE because this
+ function is only called by lock_clust_rec_some_has_impl()
+ and row_vers_impl_x_locked_off_kernel() and they have
+ diagnostic prints in this case */
+
+ return(TRUE);
+ }
+
+ trx = trx_get_on_id(trx_id);
+ if (trx && (trx->conc_state == TRX_ACTIVE
+ || trx->conc_state == TRX_PREPARED)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*****************************************************************//**
+Allocates a new transaction id.
+@return new, allocated trx id */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_id(void)
+/*========================*/
+{
+ trx_id_t id;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ /* VERY important: after the database is started, max_trx_id value is
+ divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if
+ will evaluate to TRUE when this function is first time called,
+ and the value for trx id will be written to disk-based header!
+ Thus trx id values will not overlap when the database is
+ repeatedly started! */
+
+ if (ut_dulint_get_low(trx_sys->max_trx_id)
+ % TRX_SYS_TRX_ID_WRITE_MARGIN == 0) {
+
+ trx_sys_flush_max_trx_id();
+ }
+
+ id = trx_sys->max_trx_id;
+
+ UT_DULINT_INC(trx_sys->max_trx_id);
+
+ return(id);
+}
+
+/*****************************************************************//**
+Allocates a new transaction number.
+@return new, allocated trx number */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_no(void)
+/*========================*/
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ return(trx_sys_get_new_trx_id());
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
new file mode 100644
index 00000000000..681feeaec94
--- /dev/null
+++ b/storage/innobase/include/trx0trx.h
@@ -0,0 +1,814 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.h
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0trx_h
+#define trx0trx_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "dict0types.h"
+#ifndef UNIV_HOTBACKUP
+#include "lock0types.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "mem0mem.h"
+#include "read0types.h"
+#include "trx0xa.h"
+#include "ut0vec.h"
+
+/** Dummy session used currently in MySQL interface */
+extern sess_t* trx_dummy_sess;
+
+/** Number of transactions currently allocated for MySQL: protected by
+the kernel mutex */
+extern ulint trx_n_mysql_transactions;
+
+/********************************************************************//**
+Releases the search latch if trx has reserved it. */
+UNIV_INTERN
+void
+trx_search_latch_release_if_reserved(
+/*=================================*/
+ trx_t* trx); /*!< in: transaction */
+/******************************************************************//**
+Set detailed error message for the transaction. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg); /*!< in: detailed error message */
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewinded before reading from it. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file); /*!< in: file to read message from */
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx); /*!< in: trx object */
+/****************************************************************//**
+Creates and initializes a transaction object.
+@return own: the transaction */
+UNIV_INTERN
+trx_t*
+trx_create(
+/*=======*/
+ sess_t* sess) /*!< in: session */
+ __attribute__((nonnull));
+/********************************************************************//**
+Creates a transaction object for MySQL.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_mysql(void);
+/*========================*/
+/********************************************************************//**
+Creates a transaction object for background operations by the master thread.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_background(void);
+/*=============================*/
+/********************************************************************//**
+Frees a transaction object. */
+UNIV_INTERN
+void
+trx_free(
+/*=====*/
+ trx_t* trx); /*!< in, own: trx object */
+/********************************************************************//**
+Frees a transaction object for MySQL. */
+UNIV_INTERN
+void
+trx_free_for_mysql(
+/*===============*/
+ trx_t* trx); /*!< in, own: trx object */
+/********************************************************************//**
+Frees a transaction object of a background operation of the master thread. */
+UNIV_INTERN
+void
+trx_free_for_background(
+/*====================*/
+ trx_t* trx); /*!< in, own: trx object */
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void);
+/*============================*/
+/****************************************************************//**
+Starts a new transaction.
+@return TRUE if success, FALSE if the rollback segment could not
+support this many transactions */
+UNIV_INTERN
+ibool
+trx_start(
+/*======*/
+ trx_t* trx, /*!< in: transaction */
+ ulint rseg_id);/*!< in: rollback segment id; if ULINT_UNDEFINED
+ is passed, the system chooses the rollback segment
+ automatically in a round-robin fashion */
+/****************************************************************//**
+Starts a new transaction.
+@return TRUE */
+UNIV_INTERN
+ibool
+trx_start_low(
+/*==========*/
+ trx_t* trx, /*!< in: transaction */
+ ulint rseg_id);/*!< in: rollback segment id; if ULINT_UNDEFINED
+ is passed, the system chooses the rollback segment
+ automatically in a round-robin fashion */
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INLINE
+void
+trx_start_if_not_started(
+/*=====================*/
+ trx_t* trx); /*!< in: transaction */
+/*************************************************************//**
+Starts the transaction if it is not yet started. Assumes we have reserved
+the kernel mutex! */
+UNIV_INLINE
+void
+trx_start_if_not_started_low(
+/*=========================*/
+ trx_t* trx); /*!< in: transaction */
+/****************************************************************//**
+Commits a transaction. */
+UNIV_INTERN
+void
+trx_commit_off_kernel(
+/*==================*/
+ trx_t* trx); /*!< in: transaction */
+/****************************************************************//**
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, andf we cannot roll it back. */
+UNIV_INTERN
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+ trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx); /*!< in: trx handle */
+/**********************************************************************//**
+Does the transaction prepare for MySQL.
+@return 0 or error number */
+UNIV_INTERN
+ulint
+trx_prepare_for_mysql(
+/*==================*/
+ trx_t* trx); /*!< in: trx handle */
+/**********************************************************************//**
+This function is used to find number of prepared transactions and
+their transaction objects for a recovery.
+@return number of prepared transactions */
+UNIV_INTERN
+int
+trx_recover_for_mysql(
+/*==================*/
+ XID* xid_list, /*!< in/out: prepared transactions */
+ ulint len); /*!< in: number of slots in xid_list */
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state
+@return trx or NULL */
+UNIV_INTERN
+trx_t *
+trx_get_trx_by_xid(
+/*===============*/
+ XID* xid); /*!< in: X/Open XA transaction identification */
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE.
+@return 0 or error number */
+UNIV_INTERN
+ulint
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx); /*!< in: trx handle */
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+UNIV_INTERN
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx); /*!< in: trx handle */
+/********************************************************************//**
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a new started transaction.
+@return consistent read view */
+UNIV_INTERN
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+ trx_t* trx); /*!< in: active transaction */
+/***********************************************************//**
+The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
+the TRX_QUE_RUNNING state and releases query threads which were
+waiting for a lock in the wait_thrs list. */
+UNIV_INTERN
+void
+trx_end_lock_wait(
+/*==============*/
+ trx_t* trx); /*!< in: transaction */
+/****************************************************************//**
+Sends a signal to a trx object. */
+UNIV_INTERN
+void
+trx_sig_send(
+/*=========*/
+ trx_t* trx, /*!< in: trx handle */
+ ulint type, /*!< in: signal type */
+ ulint sender, /*!< in: TRX_SIG_SELF or
+ TRX_SIG_OTHER_SESS */
+ que_thr_t* receiver_thr, /*!< in: query thread which wants the
+ reply, or NULL; if type is
+ TRX_SIG_END_WAIT, this must be NULL */
+ trx_savept_t* savept, /*!< in: possible rollback savepoint, or
+ NULL */
+ que_thr_t** next_thr); /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the parameter
+ is NULL, it is ignored */
+/****************************************************************//**
+Send the reply message when a signal in the queue of the trx has
+been handled. */
+UNIV_INTERN
+void
+trx_sig_reply(
+/*==========*/
+ trx_sig_t* sig, /*!< in: signal */
+ que_thr_t** next_thr); /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/****************************************************************//**
+Removes the signal object from a trx signal queue. */
+UNIV_INTERN
+void
+trx_sig_remove(
+/*===========*/
+ trx_t* trx, /*!< in: trx handle */
+ trx_sig_t* sig); /*!< in, own: signal */
+/****************************************************************//**
+Starts handling of a trx signal. */
+UNIV_INTERN
+void
+trx_sig_start_handle(
+/*=================*/
+ trx_t* trx, /*!< in: trx handle */
+ que_thr_t** next_thr); /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/****************************************************************//**
+Ends signal handling. If the session is in the error state, and
+trx->graph_before_signal_handling != NULL, returns control to the error
+handling routine of the graph (currently only returns the control to the
+graph root which then sends an error message to the client). */
+UNIV_INTERN
+void
+trx_end_signal_handling(
+/*====================*/
+ trx_t* trx); /*!< in: trx */
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+commit_node_create(
+/*===============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/**********************************************************************//**
+Prints info about a transaction to the given file. The caller must own the
+kernel mutex and must have called
+innobase_mysql_prepare_print_arbitrary_thd(), unless he knows that MySQL
+or InnoDB cannot meanwhile change the info printed here. */
+UNIV_INTERN
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ trx_t* trx, /*!< in: transaction */
+ ulint max_query_len); /*!< in: max query length to print, or 0 to
+ use the default max length */
+
+/** Type of data dictionary operation */
+enum trx_dict_op {
+ /** The transaction is not modifying the data dictionary. */
+ TRX_DICT_OP_NONE = 0,
+ /** The transaction is creating a table or an index, or
+ dropping a table. The table must be dropped in crash
+ recovery. This and TRX_DICT_OP_NONE are the only possible
+ operation modes in crash recovery. */
+ TRX_DICT_OP_TABLE = 1,
+ /** The transaction is creating or dropping an index in an
+ existing table. In crash recovery, the the data dictionary
+ must be locked, but the table must not be dropped. */
+ TRX_DICT_OP_INDEX = 2
+};
+
+/**********************************************************************//**
+Determine if a transaction is a dictionary operation.
+@return dictionary operation mode */
+UNIV_INLINE
+enum trx_dict_op
+trx_get_dict_operation(
+/*===================*/
+ const trx_t* trx) /*!< in: transaction */
+ __attribute__((pure));
+/**********************************************************************//**
+Flag a transaction a dictionary operation. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ enum trx_dict_op op); /*!< in: operation, not
+ TRX_DICT_OP_NONE */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return TRUE if interrupted */
+UNIV_INTERN
+ibool
+trx_is_interrupted(
+/*===============*/
+ trx_t* trx); /*!< in: transaction */
+#else /* !UNIV_HOTBACKUP */
+#define trx_is_interrupted(trx) FALSE
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Calculates the "weight" of a transaction. The weight of one transaction
+is estimated as the number of altered rows + the number of locked rows.
+@param t transaction
+@return transaction weight */
+#define TRX_WEIGHT(t) \
+ ut_dulint_add((t)->undo_no, UT_LIST_GET_LEN((t)->trx_locks))
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return <0, 0 or >0; similar to strcmp(3) */
+UNIV_INTERN
+int
+trx_weight_cmp(
+/*===========*/
+ const trx_t* a, /*!< in: the first transaction to be compared */
+ const trx_t* b); /*!< in: the second transaction to be compared */
+
+/*******************************************************************//**
+Retrieves transacion's id, represented as unsigned long long.
+@return transaction's id */
+UNIV_INLINE
+ullint
+trx_get_id(
+/*=======*/
+ const trx_t* trx); /*!< in: transaction */
+
+/* Maximum length of a string that can be returned by
+trx_get_que_state_str(). */
+#define TRX_QUE_STATE_STR_MAX_LEN 12 /* "ROLLING BACK" */
+
+/*******************************************************************//**
+Retrieves transaction's que state in a human readable string. The string
+should not be free()'d or modified.
+@return string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+ const trx_t* trx); /*!< in: transaction */
+
+/* Signal to a transaction */
+struct trx_sig_struct{
+ unsigned type:3; /*!< signal type */
+ unsigned sender:1; /*!< TRX_SIG_SELF or
+ TRX_SIG_OTHER_SESS */
+ que_thr_t* receiver; /*!< non-NULL if the sender of the signal
+ wants reply after the operation induced
+ by the signal is completed */
+ trx_savept_t savept; /*!< possible rollback savepoint */
+ UT_LIST_NODE_T(trx_sig_t)
+ signals; /*!< queue of pending signals to the
+ transaction */
+ UT_LIST_NODE_T(trx_sig_t)
+ reply_signals; /*!< list of signals for which the sender
+ transaction is waiting a reply */
+};
+
+#define TRX_MAGIC_N 91118598
+
+/* The transaction handle; every session has a trx object which is freed only
+when the session is freed; in addition there may be session-less transactions
+rolling back after a database recovery */
+
+struct trx_struct{
+ ulint magic_n;
+ /* All the next fields are protected by the kernel mutex, except the
+ undo logs which are protected by undo_mutex */
+ const char* op_info; /*!< English text describing the
+ current operation, or an empty
+ string */
+ unsigned is_purge:1; /*!< 0=user transaction, 1=purge */
+ unsigned is_recovered:1; /*!< 0=normal transaction,
+ 1=recovered, must be rolled back */
+ unsigned conc_state:2; /*!< state of the trx from the point
+ of view of concurrency control:
+ TRX_ACTIVE, TRX_COMMITTED_IN_MEMORY,
+ ... */
+ unsigned que_state:2; /*!< valid when conc_state == TRX_ACTIVE:
+ TRX_QUE_RUNNING, TRX_QUE_LOCK_WAIT,
+ ... */
+ unsigned isolation_level:2;/* TRX_ISO_REPEATABLE_READ, ... */
+ unsigned check_foreigns:1;/* normally TRUE, but if the user
+ wants to suppress foreign key checks,
+ (in table imports, for example) we
+ set this FALSE */
+ unsigned check_unique_secondary:1;
+ /* normally TRUE, but if the user
+ wants to speed up inserts by
+ suppressing unique key checks
+ for secondary indexes when we decide
+ if we can use the insert buffer for
+ them, we set this FALSE */
+ unsigned support_xa:1; /*!< normally we do the XA two-phase
+ commit steps, but by setting this to
+ FALSE, one can save CPU time and about
+ 150 bytes in the undo log size as then
+ we skip XA steps */
+ unsigned flush_log_later:1;/* In 2PC, we hold the
+ prepare_commit mutex across
+ both phases. In that case, we
+ defer flush of the logs to disk
+ until after we release the
+ mutex. */
+ unsigned must_flush_log_later:1;/* this flag is set to TRUE in
+ trx_commit_off_kernel() if
+ flush_log_later was TRUE, and there
+ were modifications by the transaction;
+ in that case we must flush the log
+ in trx_commit_complete_for_mysql() */
+ unsigned dict_operation:2;/**< @see enum trx_dict_op */
+ unsigned duplicates:2; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
+ unsigned active_trans:2; /*!< 1 - if a transaction in MySQL
+ is active. 2 - if prepare_commit_mutex
+ was taken */
+ unsigned has_search_latch:1;
+ /* TRUE if this trx has latched the
+ search system latch in S-mode */
+ unsigned declared_to_be_inside_innodb:1;
+ /* this is TRUE if we have declared
+ this transaction in
+ srv_conc_enter_innodb to be inside the
+ InnoDB engine */
+ unsigned handling_signals:1;/* this is TRUE as long as the trx
+ is handling signals */
+ unsigned dict_operation_lock_mode:2;
+ /* 0, RW_S_LATCH, or RW_X_LATCH:
+ the latch mode trx currently holds
+ on dict_operation_lock */
+ time_t start_time; /*!< time the trx object was created
+ or the state last time became
+ TRX_ACTIVE */
+ trx_id_t id; /*!< transaction id */
+ XID xid; /*!< X/Open XA transaction
+ identification to identify a
+ transaction branch */
+ trx_id_t no; /*!< transaction serialization number ==
+ max trx id when the transaction is
+ moved to COMMITTED_IN_MEMORY state */
+ ib_uint64_t commit_lsn; /*!< lsn at the time of the commit */
+ trx_id_t table_id; /*!< Table to drop iff dict_operation
+ is TRUE, or ut_dulint_zero. */
+ /*------------------------------*/
+ void* mysql_thd; /*!< MySQL thread handle corresponding
+ to this trx, or NULL */
+ char** mysql_query_str;/* pointer to the field in mysqld_thd
+ which contains the pointer to the
+ current SQL query string */
+ const char* mysql_log_file_name;
+ /* if MySQL binlog is used, this field
+ contains a pointer to the latest file
+ name; this is NULL if binlog is not
+ used */
+ ib_int64_t mysql_log_offset;/* if MySQL binlog is used, this field
+ contains the end offset of the binlog
+ entry */
+ os_thread_id_t mysql_thread_id;/* id of the MySQL thread associated
+ with this transaction object */
+ ulint mysql_process_no;/* since in Linux, 'top' reports
+ process id's and not thread id's, we
+ store the process number too */
+ /*------------------------------*/
+ ulint n_mysql_tables_in_use; /* number of Innobase tables
+ used in the processing of the current
+ SQL statement in MySQL */
+ ulint mysql_n_tables_locked;
+ /* how many tables the current SQL
+ statement uses, except those
+ in consistent read */
+ ulint search_latch_timeout;
+ /* If we notice that someone is
+ waiting for our S-lock on the search
+ latch to be released, we wait in
+ row0sel.c for BTR_SEA_TIMEOUT new
+ searches until we try to keep
+ the search latch again over
+ calls from MySQL; this is intended
+ to reduce contention on the search
+ latch */
+ /*------------------------------*/
+ ulint n_tickets_to_enter_innodb;
+ /* this can be > 0 only when
+ declared_to_... is TRUE; when we come
+ to srv_conc_innodb_enter, if the value
+ here is > 0, we decrement this by 1 */
+ /*------------------------------*/
+ UT_LIST_NODE_T(trx_t)
+ trx_list; /*!< list of transactions */
+ UT_LIST_NODE_T(trx_t)
+ mysql_trx_list; /*!< list of transactions created for
+ MySQL */
+ /*------------------------------*/
+ ulint error_state; /*!< 0 if no error, otherwise error
+ number; NOTE That ONLY the thread
+ doing the transaction is allowed to
+ set this field: this is NOT protected
+ by the kernel mutex */
+ const dict_index_t*error_info; /*!< if the error number indicates a
+ duplicate key error, a pointer to
+ the problematic index is stored here */
+ ulint error_key_num; /*!< if the index creation fails to a
+ duplicate key error, a mysql key
+ number of that index is stored here */
+ sess_t* sess; /*!< session of the trx, NULL if none */
+ que_t* graph; /*!< query currently run in the session,
+ or NULL if none; NOTE that the query
+ belongs to the session, and it can
+ survive over a transaction commit, if
+ it is a stored procedure with a COMMIT
+ WORK statement, for instance */
+ ulint n_active_thrs; /*!< number of active query threads */
+ que_t* graph_before_signal_handling;
+ /* value of graph when signal handling
+ for this trx started: this is used to
+ return control to the original query
+ graph for error processing */
+ trx_sig_t sig; /*!< one signal object can be allocated
+ in this space, avoiding mem_alloc */
+ UT_LIST_BASE_NODE_T(trx_sig_t)
+ signals; /*!< queue of processed or pending
+ signals to the trx */
+ UT_LIST_BASE_NODE_T(trx_sig_t)
+ reply_signals; /*!< list of signals sent by the query
+ threads of this trx for which a thread
+ is waiting for a reply; if this trx is
+ killed, the reply requests in the list
+ must be canceled */
+ /*------------------------------*/
+ lock_t* wait_lock; /*!< if trx execution state is
+ TRX_QUE_LOCK_WAIT, this points to
+ the lock request, otherwise this is
+ NULL */
+ ibool was_chosen_as_deadlock_victim;
+ /* when the transaction decides to wait
+ for a lock, it sets this to FALSE;
+ if another transaction chooses this
+ transaction as a victim in deadlock
+ resolution, it sets this to TRUE */
+ time_t wait_started; /*!< lock wait started at this time */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ wait_thrs; /*!< query threads belonging to this
+ trx that are in the QUE_THR_LOCK_WAIT
+ state */
+ ulint deadlock_mark; /*!< a mark field used in deadlock
+ checking algorithm. This must be
+ in its own machine word, because
+ it can be changed by other
+ threads while holding kernel_mutex. */
+ /*------------------------------*/
+ mem_heap_t* lock_heap; /*!< memory heap for the locks of the
+ transaction */
+ UT_LIST_BASE_NODE_T(lock_t)
+ trx_locks; /*!< locks reserved by the transaction */
+ /*------------------------------*/
+ mem_heap_t* global_read_view_heap;
+ /* memory heap for the global read
+ view */
+ read_view_t* global_read_view;
+ /* consistent read view associated
+ to a transaction or NULL */
+ read_view_t* read_view; /*!< consistent read view used in the
+ transaction or NULL, this read view
+ if defined can be normal read view
+ associated to a transaction (i.e.
+ same as global_read_view) or read view
+ associated to a cursor */
+ /*------------------------------*/
+ UT_LIST_BASE_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< savepoints set with SAVEPOINT ...,
+ oldest first */
+ /*------------------------------*/
+ mutex_t undo_mutex; /*!< mutex protecting the fields in this
+ section (down to undo_no_arr), EXCEPT
+ last_sql_stat_start, which can be
+ accessed only when we know that there
+ cannot be any activity in the undo
+ logs! */
+ undo_no_t undo_no; /*!< next undo log record number to
+ assign; since the undo log is
+ private for a transaction, this
+ is a simple ascending sequence
+ with no gaps; thus it represents
+ the number of modified/inserted
+ rows in a transaction */
+ trx_savept_t last_sql_stat_start;
+ /* undo_no when the last sql statement
+ was started: in case of an error, trx
+ is rolled back down to this undo
+ number; see note at undo_mutex! */
+ trx_rseg_t* rseg; /*!< rollback segment assigned to the
+ transaction, or NULL if not assigned
+ yet */
+ trx_undo_t* insert_undo; /*!< pointer to the insert undo log, or
+ NULL if no inserts performed yet */
+ trx_undo_t* update_undo; /*!< pointer to the update undo log, or
+ NULL if no update performed yet */
+ undo_no_t roll_limit; /*!< least undo number to undo during
+ a rollback */
+ ulint pages_undone; /*!< number of undo log pages undone
+ since the last undo log truncation */
+ trx_undo_arr_t* undo_no_arr; /*!< array of undo numbers of undo log
+ records which are currently processed
+ by a rollback operation */
+ /*------------------------------*/
+ ulint n_autoinc_rows; /*!< no. of AUTO-INC rows required for
+ an SQL statement. This is useful for
+ multi-row INSERTs */
+ ib_vector_t* autoinc_locks; /* AUTOINC locks held by this
+ transaction. Note that these are
+ also in the lock list trx_locks. This
+ vector needs to be freed explicitly
+ when the trx_t instance is desrtoyed */
+ /*------------------------------*/
+ char detailed_error[256]; /*!< detailed error message for last
+ error, or empty. */
+};
+
+#define TRX_MAX_N_THREADS 32 /* maximum number of
+ concurrent threads running a
+ single operation of a
+ transaction, e.g., a parallel
+ query */
+/* Transaction concurrency states (trx->conc_state) */
+#define TRX_NOT_STARTED 0
+#define TRX_ACTIVE 1
+#define TRX_COMMITTED_IN_MEMORY 2
+#define TRX_PREPARED 3 /* Support for 2PC/XA */
+
+/* Transaction execution states when trx->conc_state == TRX_ACTIVE */
+#define TRX_QUE_RUNNING 0 /* transaction is running */
+#define TRX_QUE_LOCK_WAIT 1 /* transaction is waiting for a lock */
+#define TRX_QUE_ROLLING_BACK 2 /* transaction is rolling back */
+#define TRX_QUE_COMMITTING 3 /* transaction is committing */
+
+/* Transaction isolation levels (trx->isolation_level) */
+#define TRX_ISO_READ_UNCOMMITTED 0 /* dirty read: non-locking
+ SELECTs are performed so that
+ we do not look at a possible
+ earlier version of a record;
+ thus they are not 'consistent'
+ reads under this isolation
+ level; otherwise like level
+ 2 */
+
+#define TRX_ISO_READ_COMMITTED 1 /* somewhat Oracle-like
+ isolation, except that in
+ range UPDATE and DELETE we
+ must block phantom rows
+ with next-key locks;
+ SELECT ... FOR UPDATE and ...
+ LOCK IN SHARE MODE only lock
+ the index records, NOT the
+ gaps before them, and thus
+ allow free inserting;
+ each consistent read reads its
+ own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ 2 /* this is the default;
+ all consistent reads in the
+ same trx read the same
+ snapshot;
+ full next-key locking used
+ in locking reads to block
+ insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE 3 /* all plain SELECTs are
+ converted to LOCK IN SHARE
+ MODE reads */
+
+/* Treatment of duplicate values (trx->duplicates; for example, in inserts).
+Multiple flags can be combined with bitwise OR. */
+#define TRX_DUP_IGNORE 1 /* duplicate rows are to be updated */
+#define TRX_DUP_REPLACE 2 /* duplicate rows are to be replaced */
+
+
+/* Types of a trx signal */
+#define TRX_SIG_NO_SIGNAL 0
+#define TRX_SIG_TOTAL_ROLLBACK 1
+#define TRX_SIG_ROLLBACK_TO_SAVEPT 2
+#define TRX_SIG_COMMIT 3
+#define TRX_SIG_ERROR_OCCURRED 4
+#define TRX_SIG_BREAK_EXECUTION 5
+
+/* Sender types of a signal */
+#define TRX_SIG_SELF 0 /* sent by the session itself, or
+ by an error occurring within this
+ session */
+#define TRX_SIG_OTHER_SESS 1 /* sent by another session (which
+ must hold rights to this) */
+
+/** Commit node states */
+enum commit_node_state {
+ COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to
+ the transaction */
+ COMMIT_NODE_WAIT /*!< commit signal sent to the transaction,
+ waiting for completion */
+};
+
+/** Commit command node in a query graph */
+struct commit_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_COMMIT */
+ enum commit_node_state
+ state; /*!< node execution state */
+};
+
+
+
+#ifndef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/trx0trx.ic b/storage/innobase/include/trx0trx.ic
new file mode 100644
index 00000000000..7332eeece85
--- /dev/null
+++ b/storage/innobase/include/trx0trx.ic
@@ -0,0 +1,164 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.ic
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INLINE
+void
+trx_start_if_not_started(
+/*=====================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY);
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+
+ trx_start(trx, ULINT_UNDEFINED);
+ }
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. Assumes we have reserved
+the kernel mutex! */
+UNIV_INLINE
+void
+trx_start_if_not_started_low(
+/*=========================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY);
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+
+ trx_start_low(trx, ULINT_UNDEFINED);
+ }
+}
+
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx) /*!< in: trx object */
+{
+ return(trx->error_info);
+}
+
+/*******************************************************************//**
+Retrieves transacion's id, represented as unsigned long long.
+@return transaction's id */
+UNIV_INLINE
+ullint
+trx_get_id(
+/*=======*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ return((ullint)ut_conv_dulint_to_longlong(trx->id));
+}
+
+/*******************************************************************//**
+Retrieves transaction's que state in a human readable string. The string
+should not be free()'d or modified.
+@return string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ /* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */
+ switch (trx->que_state) {
+ case TRX_QUE_RUNNING:
+ return("RUNNING");
+ case TRX_QUE_LOCK_WAIT:
+ return("LOCK WAIT");
+ case TRX_QUE_ROLLING_BACK:
+ return("ROLLING BACK");
+ case TRX_QUE_COMMITTING:
+ return("COMMITTING");
+ default:
+ return("UNKNOWN");
+ }
+}
+
+/**********************************************************************//**
+Determine if a transaction is a dictionary operation.
+@return dictionary operation mode */
+UNIV_INLINE
+enum trx_dict_op
+trx_get_dict_operation(
+/*===================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ enum trx_dict_op op = (enum trx_dict_op) trx->dict_operation;
+
+#ifdef UNIV_DEBUG
+ switch (op) {
+ case TRX_DICT_OP_NONE:
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ return(op);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return((enum trx_dict_op) UNIV_EXPECT(op, TRX_DICT_OP_NONE));
+}
+/**********************************************************************//**
+Flag a transaction a dictionary operation. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ enum trx_dict_op op) /*!< in: operation, not
+ TRX_DICT_OP_NONE */
+{
+#ifdef UNIV_DEBUG
+ enum trx_dict_op old_op = trx_get_dict_operation(trx);
+
+ switch (op) {
+ case TRX_DICT_OP_NONE:
+ ut_error;
+ break;
+ case TRX_DICT_OP_TABLE:
+ switch (old_op) {
+ case TRX_DICT_OP_NONE:
+ case TRX_DICT_OP_INDEX:
+ case TRX_DICT_OP_TABLE:
+ goto ok;
+ }
+ ut_error;
+ break;
+ case TRX_DICT_OP_INDEX:
+ ut_ad(old_op == TRX_DICT_OP_NONE);
+ break;
+ }
+ok:
+#endif /* UNIV_DEBUG */
+
+ trx->dict_operation = op;
+}
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
new file mode 100644
index 00000000000..08cc9622d02
--- /dev/null
+++ b/storage/innobase/include/trx0types.h
@@ -0,0 +1,108 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0types.h
+Transaction system global type definitions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0types_h
+#define trx0types_h
+
+#include "ut0byte.h"
+
+/** prepare trx_t::id for being printed via printf(3) */
+#define TRX_ID_PREP_PRINTF(id) (ullint) ut_conv_dulint_to_longlong(id)
+
+/** printf(3) format used for printing TRX_ID_PRINTF_PREP() */
+#define TRX_ID_FMT "%llX"
+
+/** maximum length that a formatted trx_t::id could take, not including
+the terminating NUL character. */
+#define TRX_ID_MAX_LEN 17
+
+/** Memory objects */
+/* @{ */
+/** Transaction */
+typedef struct trx_struct trx_t;
+/** Transaction system */
+typedef struct trx_sys_struct trx_sys_t;
+/** Doublewrite information */
+typedef struct trx_doublewrite_struct trx_doublewrite_t;
+/** Signal */
+typedef struct trx_sig_struct trx_sig_t;
+/** Rollback segment */
+typedef struct trx_rseg_struct trx_rseg_t;
+/** Transaction undo log */
+typedef struct trx_undo_struct trx_undo_t;
+/** Array of undo numbers of undo records being rolled back or purged */
+typedef struct trx_undo_arr_struct trx_undo_arr_t;
+/** A cell of trx_undo_arr_t */
+typedef struct trx_undo_inf_struct trx_undo_inf_t;
+/** The control structure used in the purge operation */
+typedef struct trx_purge_struct trx_purge_t;
+/** Rollback command node in a query graph */
+typedef struct roll_node_struct roll_node_t;
+/** Commit command node in a query graph */
+typedef struct commit_node_struct commit_node_t;
+/** SAVEPOINT command node in a query graph */
+typedef struct trx_named_savept_struct trx_named_savept_t;
+/* @} */
+
+/** Rollback contexts */
+enum trx_rb_ctx {
+ RB_NONE = 0, /*!< no rollback */
+ RB_NORMAL, /*!< normal rollback */
+ RB_RECOVERY, /*!< rolling back an incomplete transaction,
+ in crash recovery */
+};
+
+/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */
+typedef dulint trx_id_t;
+/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */
+typedef dulint roll_ptr_t;
+/** Undo number */
+typedef dulint undo_no_t;
+
+/** Transaction savepoint */
+typedef struct trx_savept_struct trx_savept_t;
+/** Transaction savepoint */
+struct trx_savept_struct{
+ undo_no_t least_undo_no; /*!< least undo number to undo */
+};
+
+/** File objects */
+/* @{ */
+/** Transaction system header */
+typedef byte trx_sysf_t;
+/** Rollback segment header */
+typedef byte trx_rsegf_t;
+/** Undo segment header */
+typedef byte trx_usegf_t;
+/** Undo log header */
+typedef byte trx_ulogf_t;
+/** Undo log page header */
+typedef byte trx_upagef_t;
+
+/** Undo log record */
+typedef byte trx_undo_rec_t;
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
new file mode 100644
index 00000000000..4db10eaa92e
--- /dev/null
+++ b/storage/innobase/include/trx0undo.h
@@ -0,0 +1,544 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.h
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0undo_h
+#define trx0undo_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "page0types.h"
+#include "trx0xa.h"
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+ ibool is_insert, /*!< in: TRUE if insert undo log */
+ ulint rseg_id, /*!< in: rollback segment id */
+ ulint page_no, /*!< in: page number */
+ ulint offset); /*!< in: offset of the undo entry within page */
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer */
+ ibool* is_insert, /*!< out: TRUE if insert undo log */
+ ulint* rseg_id, /*!< out: rollback segment id */
+ ulint* page_no, /*!< out: page number */
+ ulint* offset); /*!< out: offset of the undo
+ entry within page */
+/***********************************************************************//**
+Returns TRUE if the roll pointer is of the insert type.
+@return TRUE if insert undo log */
+UNIV_INLINE
+ibool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ roll_ptr_t roll_ptr); /*!< in: roll pointer */
+#endif /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Writes a roll ptr to an index page. In case that the size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_roll_ptr(
+/*===============*/
+ byte* ptr, /*!< in: pointer to memory where
+ written */
+ roll_ptr_t roll_ptr); /*!< in: roll ptr */
+/*****************************************************************//**
+Reads a roll ptr from an index page. In case that the roll ptr size
+changes in some future version, this function should be used instead of
+mach_read_...
+@return roll ptr */
+UNIV_INLINE
+roll_ptr_t
+trx_read_roll_ptr(
+/*==============*/
+ const byte* ptr); /*!< in: pointer to memory from where to read */
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Gets an undo log page and x-latches it.
+@return pointer to page x-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Gets an undo log page and s-latches it.
+@return pointer to page s-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get_s_latched(
+/*========================*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Returns the previous undo record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the next undo log record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_next_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the last undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(
+/*=======================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset); /*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the first undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(
+/*========================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header offset on page */
+/***********************************************************************//**
+Gets the previous record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_prev_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************************//**
+Gets the next record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_next_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************************//**
+Gets the first record in an undo log.
+@return undo log record, the page latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_first_rec(
+/*===================*/
+ ulint space, /*!< in: undo log header space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ ulint mode, /*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+Tries to add a page to the undo log segment where the undo log is placed.
+@return page number if success, else FIL_NULL */
+UNIV_INTERN
+ulint
+trx_undo_add_page(
+/*==============*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory object */
+ mtr_t* mtr); /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+/***********************************************************************//**
+Truncates an undo log from the end. This function is used during a rollback
+to free space from an undo log. */
+UNIV_INTERN
+void
+trx_undo_truncate_end(
+/*==================*/
+ trx_t* trx, /*!< in: transaction whose undo log it is */
+ trx_undo_t* undo, /*!< in: undo log */
+ undo_no_t limit); /*!< in: all undo records with undo number
+ >= this value should be truncated */
+/***********************************************************************//**
+Truncates an undo log from the start. This function is used during a purge
+operation. */
+UNIV_INTERN
+void
+trx_undo_truncate_start(
+/*====================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ ulint space, /*!< in: space id of the log */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint hdr_offset, /*!< in: header offset on the page */
+ undo_no_t limit); /*!< in: all undo pages with
+ undo numbers < this value
+ should be truncated; NOTE that
+ the function only frees whole
+ pages; the header page is not
+ freed, but emptied, if all the
+ records there are < limit */
+/********************************************************************//**
+Initializes the undo log lists for a rollback segment memory copy.
+This function is only called when the database is started or a new
+rollback segment created.
+@return the combined size of undo log segments in pages */
+UNIV_INTERN
+ulint
+trx_undo_lists_init(
+/*================*/
+ trx_rseg_t* rseg); /*!< in: rollback segment memory object */
+/**********************************************************************//**
+Assigns an undo log for a transaction. A new undo log is created or a cached
+undo log reused.
+@return DB_SUCCESS if undo log assign successful, possible error codes
+are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE
+DB_OUT_OF_MEMORY */
+UNIV_INTERN
+ulint
+trx_undo_assign_undo(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ ulint type); /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction prepare.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_prepare(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**********************************************************************//**
+Adds the update undo log header as the first in the history list, and
+frees the memory object, or puts it to the list of cached update undo log
+segments. */
+UNIV_INTERN
+void
+trx_undo_update_cleanup(
+/*====================*/
+ trx_t* trx, /*!< in: trx owning the update undo log */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Frees or caches an insert undo log after a transaction commit or rollback.
+Knowledge of inserts is not needed after a commit or rollback, therefore
+the data can be discarded. */
+UNIV_INTERN
+void
+trx_undo_insert_cleanup(
+/*====================*/
+ trx_t* trx); /*!< in: transaction handle */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses the redo log entry of an undo log page initialization.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_init(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses the redo log entry of an undo log page header create or reuse.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+ ulint type, /*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses the redo log entry of an undo log page header discard.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+
+/* Types of an undo log segment */
+#define TRX_UNDO_INSERT 1 /* contains undo entries for inserts */
+#define TRX_UNDO_UPDATE 2 /* contains undo entries for updates
+ and delete markings: in short,
+ modifys (the name 'UPDATE' is a
+ historical relic) */
+/* States of an undo log segment */
+#define TRX_UNDO_ACTIVE 1 /* contains an undo log of an active
+ transaction */
+#define TRX_UNDO_CACHED 2 /* cached for quick reuse */
+#define TRX_UNDO_TO_FREE 3 /* insert undo segment can be freed */
+#define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be
+ reused: it can be freed in purge when
+ all undo data in it is removed */
+#define TRX_UNDO_PREPARED 5 /* contains an undo log of an
+ prepared transaction */
+
+#ifndef UNIV_HOTBACKUP
+/** Transaction undo log memory object; this is protected by the undo_mutex
+in the corresponding transaction object */
+
+struct trx_undo_struct{
+ /*-----------------------------*/
+ ulint id; /*!< undo log slot number within the
+ rollback segment */
+ ulint type; /*!< TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ ulint state; /*!< state of the corresponding undo log
+ segment */
+ ibool del_marks; /*!< relevant only in an update undo log:
+ this is TRUE if the transaction may
+ have delete marked records, because of
+ a delete of a row or an update of an
+ indexed field; purge is then
+ necessary; also TRUE if the transaction
+ has updated an externally stored
+ field */
+ trx_id_t trx_id; /*!< id of the trx assigned to the undo
+ log */
+ XID xid; /*!< X/Open XA transaction
+ identification */
+ ibool dict_operation; /*!< TRUE if a dict operation trx */
+ dulint table_id; /*!< if a dict operation, then the table
+ id */
+ trx_rseg_t* rseg; /*!< rseg where the undo log belongs */
+ /*-----------------------------*/
+ ulint space; /*!< space id where the undo log
+ placed */
+ ulint zip_size; /*!< compressed page size of space
+ in bytes, or 0 for uncompressed */
+ ulint hdr_page_no; /*!< page number of the header page in
+ the undo log */
+ ulint hdr_offset; /*!< header offset of the undo log on the
+ page */
+ ulint last_page_no; /*!< page number of the last page in the
+ undo log; this may differ from
+ top_page_no during a rollback */
+ ulint size; /*!< current size in pages */
+ /*-----------------------------*/
+ ulint empty; /*!< TRUE if the stack of undo log
+ records is currently empty */
+ ulint top_page_no; /*!< page number where the latest undo
+ log record was catenated; during
+ rollback the page from which the latest
+ undo record was chosen */
+ ulint top_offset; /*!< offset of the latest undo record,
+ i.e., the topmost element in the undo
+ log if we think of it as a stack */
+ undo_no_t top_undo_no; /*!< undo number of the latest record */
+ buf_block_t* guess_block; /*!< guess for the buffer block where
+ the top page might reside */
+ /*-----------------------------*/
+ UT_LIST_NODE_T(trx_undo_t) undo_list;
+ /*!< undo log objects in the rollback
+ segment are chained into lists */
+};
+#endif /* !UNIV_HOTBACKUP */
+
+/** The offset of the undo log page header on pages of the undo log */
+#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA
+/*-------------------------------------------------------------*/
+/** Transaction undo log page header offsets */
+/* @{ */
+#define TRX_UNDO_PAGE_TYPE 0 /*!< TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+#define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log
+ records for the LATEST transaction
+ start on this page (remember that
+ in an update undo log, the first page
+ can contain several undo logs) */
+#define TRX_UNDO_PAGE_FREE 4 /*!< On each page of the undo log this
+ field contains the byte offset of the
+ first free byte on the page */
+#define TRX_UNDO_PAGE_NODE 6 /*!< The file list node in the chain
+ of undo log pages */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE)
+ /*!< Size of the transaction undo
+ log page header, in bytes */
+/* @} */
+
+/** An update undo segment with just one page can be reused if it has
+at most this many bytes used; we must leave space at least for one new undo
+log header on the page */
+
+#define TRX_UNDO_PAGE_REUSE_LIMIT (3 * UNIV_PAGE_SIZE / 4)
+
+/* An update undo log segment may contain several undo logs on its first page
+if the undo logs took so little space that the segment could be cached and
+reused. All the undo log headers are then on the first page, and the last one
+owns the undo log records on subsequent pages if the segment is bigger than
+one page. If an undo log is stored in a segment, then on the first page it is
+allowed to have zero undo records, but if the segment extends to several
+pages, then all the rest of the pages must contain at least one undo log
+record. */
+
+/** The offset of the undo log segment header on the first page of the undo
+log segment */
+
+#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE)
+/** Undo log segment header */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_STATE 0 /*!< TRX_UNDO_ACTIVE, ... */
+#define TRX_UNDO_LAST_LOG 2 /*!< Offset of the last undo log header
+ on the segment header page, 0 if
+ none */
+#define TRX_UNDO_FSEG_HEADER 4 /*!< Header for the file segment which
+ the undo log segment occupies */
+#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE)
+ /*!< Base node for the list of pages in
+ the undo log segment; defined only on
+ the undo log segment's first page */
+/*-------------------------------------------------------------*/
+/** Size of the undo log segment header */
+#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE)
+/* @} */
+
+
+/** The undo log header. There can be several undo log headers on the first
+page of an update undo log segment. */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_TRX_ID 0 /*!< Transaction id */
+#define TRX_UNDO_TRX_NO 8 /*!< Transaction number of the
+ transaction; defined only if the log
+ is in a history list */
+#define TRX_UNDO_DEL_MARKS 16 /*!< Defined only in an update undo
+ log: TRUE if the transaction may have
+ done delete markings of records, and
+ thus purge is necessary */
+#define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record
+ of this log on the header page; purge
+ may remove undo log record from the
+ log start, and therefore this is not
+ necessarily the same as this log
+ header end offset */
+#define TRX_UNDO_XID_EXISTS 20 /*!< TRUE if undo log header includes
+ X/Open XA transaction identification
+ XID */
+#define TRX_UNDO_DICT_TRANS 21 /*!< TRUE if the transaction is a table
+ create, index create, or drop
+ transaction: in recovery
+ the transaction cannot be rolled back
+ in the usual way: a 'rollback' rather
+ means dropping the created or dropped
+ table, if it still exists */
+#define TRX_UNDO_TABLE_ID 22 /*!< Id of the table if the preceding
+ field is TRUE */
+#define TRX_UNDO_NEXT_LOG 30 /*!< Offset of the next undo log header
+ on this page, 0 if none */
+#define TRX_UNDO_PREV_LOG 32 /*!< Offset of the previous undo log
+ header on this page, 0 if none */
+#define TRX_UNDO_HISTORY_NODE 34 /*!< If the log is put to the history
+ list, the file list node is here */
+/*-------------------------------------------------------------*/
+/** Size of the undo log header without XID information */
+#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE)
+
+/* Note: the writing of the undo log old header is coded by a log record
+MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE. The appending of an XID to the
+header is logged separately. In this sense, the XID is not really a member
+of the undo log header. TODO: do not append the XID to the log header if XA
+is not needed by the user. The XID wastes about 150 bytes of space in every
+undo log. In the history list we may have millions of undo logs, which means
+quite a large overhead. */
+
+/** X/Open XA Transaction Identification (XID) */
+/* @{ */
+/** xid_t::formatID */
+#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE)
+/** xid_t::gtrid_length */
+#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4)
+/** xid_t::bqual_length */
+#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4)
+/** Distributed transaction identifier data */
+#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4)
+/*--------------------------------------------------------------*/
+#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE)
+ /*!< Total size of the undo log header
+ with the XA XID */
+/* @} */
+
+#ifndef UNIV_NONINL
+#include "trx0undo.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic
new file mode 100644
index 00000000000..2d289b34ef1
--- /dev/null
+++ b/storage/innobase/include/trx0undo.ic
@@ -0,0 +1,351 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.ic
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+#include "page0page.h"
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+ ibool is_insert, /*!< in: TRUE if insert undo log */
+ ulint rseg_id, /*!< in: rollback segment id */
+ ulint page_no, /*!< in: page number */
+ ulint offset) /*!< in: offset of the undo entry within page */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ ut_ad(rseg_id < 128);
+
+ return(ut_dulint_create(is_insert * 128 * 256 * 256
+ + rseg_id * 256 * 256
+ + (page_no / 256) / 256,
+ (page_no % (256 * 256)) * 256 * 256
+ + offset));
+}
+
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer */
+ ibool* is_insert, /*!< out: TRUE if insert undo log */
+ ulint* rseg_id, /*!< out: rollback segment id */
+ ulint* page_no, /*!< out: page number */
+ ulint* offset) /*!< out: offset of the undo
+ entry within page */
+{
+ ulint low;
+ ulint high;
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ high = ut_dulint_get_high(roll_ptr);
+ low = ut_dulint_get_low(roll_ptr);
+
+ *offset = low % (256 * 256);
+
+ *is_insert = high / (256 * 256 * 128); /* TRUE == 1 */
+ *rseg_id = (high / (256 * 256)) % 128;
+
+ *page_no = (high % (256 * 256)) * 256 * 256
+ + (low / 256) / 256;
+}
+
+/***********************************************************************//**
+Returns TRUE if the roll pointer is of the insert type.
+@return TRUE if insert undo log */
+UNIV_INLINE
+ibool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ roll_ptr_t roll_ptr) /*!< in: roll pointer */
+{
+ ulint high;
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ high = ut_dulint_get_high(roll_ptr);
+
+ return(high / (256 * 256 * 128));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+Writes a roll ptr to an index page. In case that the size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_roll_ptr(
+/*===============*/
+ byte* ptr, /*!< in: pointer to memory where
+ written */
+ roll_ptr_t roll_ptr) /*!< in: roll ptr */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ mach_write_to_7(ptr, roll_ptr);
+}
+
+/*****************************************************************//**
+Reads a roll ptr from an index page. In case that the roll ptr size
+changes in some future version, this function should be used instead of
+mach_read_...
+@return roll ptr */
+UNIV_INLINE
+roll_ptr_t
+trx_read_roll_ptr(
+/*==============*/
+ const byte* ptr) /*!< in: pointer to memory from where to read */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ return(mach_read_from_7(ptr));
+}
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Gets an undo log page and x-latches it.
+@return pointer to page x-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block = buf_page_get(space, zip_size, page_no,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return(buf_block_get_frame(block));
+}
+
+/******************************************************************//**
+Gets an undo log page and s-latches it.
+@return pointer to page s-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get_s_latched(
+/*========================*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block = buf_page_get(space, zip_size, page_no,
+ RW_S_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return(buf_block_get_frame(block));
+}
+
+/******************************************************************//**
+Returns the start offset of the undo log records of the specified undo
+log on the page.
+@return start offset */
+UNIV_INLINE
+ulint
+trx_undo_page_get_start(
+/*====================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ ulint start;
+
+ if (page_no == page_get_page_no(undo_page)) {
+
+ start = mach_read_from_2(offset + undo_page
+ + TRX_UNDO_LOG_START);
+ } else {
+ start = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+ }
+
+ return(start);
+}
+
+/******************************************************************//**
+Returns the end offset of the undo log records of the specified undo
+log on the page.
+@return end offset */
+UNIV_INLINE
+ulint
+trx_undo_page_get_end(
+/*==================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ trx_ulogf_t* log_hdr;
+ ulint end;
+
+ if (page_no == page_get_page_no(undo_page)) {
+
+ log_hdr = undo_page + offset;
+
+ end = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG);
+
+ if (end == 0) {
+ end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ }
+ } else {
+ end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ }
+
+ return(end);
+}
+
+/******************************************************************//**
+Returns the previous undo record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ page_t* undo_page;
+ ulint start;
+
+ undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE);
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+
+ if (start + undo_page == rec) {
+
+ return(NULL);
+ }
+
+ return(undo_page + mach_read_from_2(rec - 2));
+}
+
+/******************************************************************//**
+Returns the next undo log record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_next_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ page_t* undo_page;
+ ulint end;
+ ulint next;
+
+ undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE);
+
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ next = mach_read_from_2(rec);
+
+ if (next == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + next);
+}
+
+/******************************************************************//**
+Returns the last undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(
+/*=======================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ ulint start;
+ ulint end;
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ if (start == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + mach_read_from_2(undo_page + end - 2));
+}
+
+/******************************************************************//**
+Returns the first undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(
+/*========================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ ulint start;
+ ulint end;
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ if (start == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + start);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/trx0xa.h b/storage/innobase/include/trx0xa.h
new file mode 100644
index 00000000000..e0dd8a1af5b
--- /dev/null
+++ b/storage/innobase/include/trx0xa.h
@@ -0,0 +1,70 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*
+ * Start of xa.h header
+ *
+ * Define a symbol to prevent multiple inclusions of this header file
+ */
+#ifndef XA_H
+#define XA_H
+
+/*
+ * Transaction branch identification: XID and NULLXID:
+ */
+#ifndef XIDDATASIZE
+
+/** Sizes of transaction identifier */
+#define XIDDATASIZE 128 /*!< maximum size of a transaction
+ identifier, in bytes */
+#define MAXGTRIDSIZE 64 /*!< maximum size in bytes of gtrid */
+#define MAXBQUALSIZE 64 /*!< maximum size in bytes of bqual */
+
+/** X/Open XA distributed transaction identifier */
+struct xid_t {
+ long formatID; /*!< format identifier; -1
+ means that the XID is null */
+ long gtrid_length; /*!< value from 1 through 64 */
+ long bqual_length; /*!< value from 1 through 64 */
+ char data[XIDDATASIZE]; /*!< distributed transaction
+ identifier */
+};
+/** X/Open XA distributed transaction identifier */
+typedef struct xid_t XID;
+#endif
+/** X/Open XA distributed transaction status codes */
+/* @{ */
+#define XA_OK 0 /*!< normal execution */
+#define XAER_ASYNC -2 /*!< asynchronous operation already
+ outstanding */
+#define XAER_RMERR -3 /*!< a resource manager error
+ occurred in the transaction
+ branch */
+#define XAER_NOTA -4 /*!< the XID is not valid */
+#define XAER_INVAL -5 /*!< invalid arguments were given */
+#define XAER_PROTO -6 /*!< routine invoked in an improper
+ context */
+#define XAER_RMFAIL -7 /*!< resource manager unavailable */
+#define XAER_DUPID -8 /*!< the XID already exists */
+#define XAER_OUTSIDE -9 /*!< resource manager doing
+ work outside transaction */
+/* @} */
+#endif /* ifndef XA_H */
+/*
+ * End of xa.h header
+ */
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
new file mode 100644
index 00000000000..6bce6dd765e
--- /dev/null
+++ b/storage/innobase/include/univ.i
@@ -0,0 +1,498 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2009, Sun Microsystems, Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted by
+Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
+are described briefly in the InnoDB documentation. The contributions by
+Sun Microsystems are incorporated with their permission, and subject to the
+conditions contained in the file COPYING.Sun_Microsystems.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***********************************************************************//**
+@file include/univ.i
+Version control for database, common definitions, and include files
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#ifndef univ_i
+#define univ_i
+
+#ifdef UNIV_HOTBACKUP
+#include "hb_univ.i"
+#endif /* UNIV_HOTBACKUP */
+
+#define INNODB_VERSION_MAJOR 1
+#define INNODB_VERSION_MINOR 0
+#define INNODB_VERSION_BUGFIX 4
+
+/* The following is the InnoDB version as shown in
+SELECT plugin_version FROM information_schema.plugins;
+calculated in in make_version_string() in sql/sql_show.cc like this:
+"version >> 8" . "version & 0xff"
+because the version is shown with only one dot, we skip the last
+component, i.e. we show M.N.P as M.N */
+#define INNODB_VERSION_SHORT \
+ (INNODB_VERSION_MAJOR << 8 | INNODB_VERSION_MINOR)
+
+/* auxiliary macros to help creating the version as string */
+#define __INNODB_VERSION(a, b, c) (#a "." #b "." #c)
+#define _INNODB_VERSION(a, b, c) __INNODB_VERSION(a, b, c)
+
+#define INNODB_VERSION_STR \
+ _INNODB_VERSION(INNODB_VERSION_MAJOR, \
+ INNODB_VERSION_MINOR, \
+ INNODB_VERSION_BUGFIX)
+
+#define REFMAN "http://dev.mysql.com/doc/refman/5.1/en/"
+
+#ifdef MYSQL_DYNAMIC_PLUGIN
+/* In the dynamic plugin, redefine some externally visible symbols
+in order not to conflict with the symbols of a builtin InnoDB. */
+
+/* Rename all C++ classes that contain virtual functions, because we
+have not figured out how to apply the visibility=hidden attribute to
+the virtual method table (vtable) in GCC 3. */
+# define ha_innobase ha_innodb
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(MYSQL_SERVER) && !defined(__WIN__)
+# undef __WIN__
+# define __WIN__
+
+# include <windows.h>
+
+# if defined(HAVE_WINDOWS_ATOMICS)
+/* If atomics are defined we use them in InnoDB mutex implementation */
+# define HAVE_ATOMIC_BUILTINS
+# endif /* HAVE_WINDOWS_ATOMICS */
+
+# ifdef _NT_
+# define __NT__
+# endif
+
+#else
+/* The defines used with MySQL */
+
+/* Include two header files from MySQL to make the Unix flavor used
+in compiling more Posix-compatible. These headers also define __WIN__
+if we are compiling on Windows. */
+
+#ifndef UNIV_HOTBACKUP
+# include <my_global.h>
+# include <my_pthread.h>
+#endif /* UNIV_HOTBACKUP */
+
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.c */
+# include <sys/stat.h>
+# if !defined(__NETWARE__) && !defined(__WIN__)
+# include <sys/mman.h> /* mmap() for os0proc.c */
+# endif
+
+# undef PACKAGE
+# undef VERSION
+
+/* Include the header file generated by GNU autoconf */
+# ifndef __WIN__
+#ifndef UNIV_HOTBACKUP
+# include "config.h"
+#endif /* UNIV_HOTBACKUP */
+# endif
+
+# ifdef HAVE_SCHED_H
+# include <sched.h>
+# endif
+
+# if defined(HAVE_GCC_ATOMIC_BUILTINS) || defined(HAVE_SOLARIS_ATOMICS) \
+ || defined(HAVE_WINDOWS_ATOMICS)
+/* If atomics are defined we use them in InnoDB mutex implementation */
+# define HAVE_ATOMIC_BUILTINS
+# endif /* (HAVE_GCC_ATOMIC_BUILTINS) || (HAVE_SOLARIS_ATOMICS)
+ || (HAVE_WINDOWS_ATOMICS) */
+
+/* For InnoDB rw_locks to work with atomics we need the thread_id
+to be no more than machine word wide. The following enables using
+atomics for InnoDB rw_locks where these conditions are met. */
+#ifdef HAVE_ATOMIC_BUILTINS
+/* if HAVE_ATOMIC_PTHREAD_T is defined at this point that means that
+the code from plug.in has defined it and we do not need to include
+ut0auxconf.h which would either define HAVE_ATOMIC_PTHREAD_T or will
+be empty */
+# ifndef HAVE_ATOMIC_PTHREAD_T
+# include "ut0auxconf.h"
+# endif /* HAVE_ATOMIC_PTHREAD_T */
+/* now HAVE_ATOMIC_PTHREAD_T is eventually defined either by plug.in or
+from Makefile.in->ut0auxconf.h */
+# ifdef HAVE_ATOMIC_PTHREAD_T
+# define INNODB_RW_LOCKS_USE_ATOMICS
+# endif /* HAVE_ATOMIC_PTHREAD_T */
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+/* We only try to do explicit inlining of functions with gcc and
+Sun Studio */
+
+# if !defined(__GNUC__) && !(defined(__SUNPRO_C) || defined(__SUNPRO_CC))
+# undef UNIV_MUST_NOT_INLINE /* Remove compiler warning */
+# define UNIV_MUST_NOT_INLINE
+# endif
+
+# ifdef HAVE_PREAD
+# define HAVE_PWRITE
+# endif
+
+#endif /* #if (defined(WIN32) || ... */
+
+/* DEBUG VERSION CONTROL
+ ===================== */
+
+/* The following flag will make InnoDB to initialize
+all memory it allocates to zero. It hides Purify
+warnings about reading unallocated memory unless
+memory is read outside the allocated blocks. */
+/*
+#define UNIV_INIT_MEM_TO_ZERO
+*/
+
+/* When this macro is defined then additional test functions will be
+compiled. These functions live at the end of each relevant source file
+and have "test_" prefix. These functions are not called from anywhere in
+the code, they can be called from gdb after
+innobase_start_or_create_for_mysql() has executed using the call
+command. Not tested on Windows. */
+/*
+#define UNIV_COMPILE_TEST_FUNCS
+*/
+
+#if 0
+#define UNIV_DEBUG_VALGRIND /* Enable extra
+ Valgrind instrumentation */
+#define UNIV_DEBUG_PRINT /* Enable the compilation of
+ some debug print functions */
+#define UNIV_AHI_DEBUG /* Enable adaptive hash index
+ debugging without UNIV_DEBUG */
+#define UNIV_BUF_DEBUG /* Enable buffer pool
+ debugging without UNIV_DEBUG */
+#define UNIV_DEBUG /* Enable ut_ad() assertions
+ and disable UNIV_INLINE */
+#define UNIV_DEBUG_FILE_ACCESSES /* Debug .ibd file access
+ (field file_page_was_freed
+ in buf_page_t) */
+#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */
+#define UNIV_HASH_DEBUG /* debug HASH_ macros */
+#define UNIV_LIST_DEBUG /* debug UT_LIST_ macros */
+#define UNIV_MEM_DEBUG /* detect memory leaks etc */
+#define UNIV_IBUF_DEBUG /* debug the insert buffer */
+#define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer;
+this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
+and the insert buffer must be empty when the database is started */
+#define UNIV_SYNC_DEBUG /* debug mutex and latch
+operations (very slow); also UNIV_DEBUG must be defined */
+#define UNIV_SEARCH_DEBUG /* debug B-tree comparisons */
+#define UNIV_SYNC_PERF_STAT /* operation counts for
+ rw-locks and mutexes */
+#define UNIV_SEARCH_PERF_STAT /* statistics for the
+ adaptive hash index */
+#define UNIV_SRV_PRINT_LATCH_WAITS /* enable diagnostic output
+ in sync0sync.c */
+#define UNIV_BTR_PRINT /* enable functions for
+ printing B-trees */
+#define UNIV_ZIP_DEBUG /* extensive consistency checks
+ for compressed pages */
+#define UNIV_ZIP_COPY /* call page_zip_copy_recs()
+ more often */
+#endif
+
+#define UNIV_BTR_DEBUG /* check B-tree links */
+#define UNIV_LIGHT_MEM_DEBUG /* light memory debugging */
+
+#ifdef HAVE_purify
+/* The following sets all new allocated memory to zero before use:
+this can be used to eliminate unnecessary Purify warnings, but note that
+it also masks many bugs Purify could detect. For detailed Purify analysis it
+is best to remove the define below and look through the warnings one
+by one. */
+#define UNIV_SET_MEM_TO_ZERO
+#endif
+
+/*
+#define UNIV_SQL_DEBUG
+#define UNIV_LOG_DEBUG
+*/
+ /* the above option prevents forcing of log to disk
+ at a buffer page write: it should be tested with this
+ option off; also some ibuf tests are suppressed */
+/*
+#define UNIV_BASIC_LOG_DEBUG
+*/
+ /* the above option enables basic recovery debugging:
+ new allocated file pages are reset */
+
+/* Linkage specifier for non-static InnoDB symbols (variables and functions)
+that are only referenced from within InnoDB, not from MySQL */
+#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(UNIV_HOTBACKUP)
+# define UNIV_INTERN __attribute__((visibility ("hidden")))
+#else
+# define UNIV_INTERN
+#endif
+
+#if (!defined(UNIV_DEBUG) && !defined(UNIV_MUST_NOT_INLINE))
+/* Definition for inline version */
+
+#ifdef __WIN__
+# define UNIV_INLINE __inline
+#elif defined(__SUNPRO_CC) || defined(__SUNPRO_C)
+# define UNIV_INLINE static inline
+#else
+# define UNIV_INLINE static __inline__
+#endif
+
+#else
+/* If we want to compile a noninlined version we use the following macro
+definitions: */
+
+#define UNIV_NONINL
+#define UNIV_INLINE UNIV_INTERN
+
+#endif /* UNIV_DEBUG */
+
+#ifdef _WIN32
+#define UNIV_WORD_SIZE 4
+#elif defined(_WIN64)
+#define UNIV_WORD_SIZE 8
+#else
+/* MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */
+#define UNIV_WORD_SIZE SIZEOF_LONG
+#endif
+
+/* The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT 8
+
+/* The following alignment is used in aligning lints etc. */
+#define UNIV_WORD_ALIGNMENT UNIV_WORD_SIZE
+
+/*
+ DATABASE VERSION CONTROL
+ ========================
+*/
+
+/* The 2-logarithm of UNIV_PAGE_SIZE: */
+#define UNIV_PAGE_SIZE_SHIFT 14
+/* The universal page size of the database */
+#define UNIV_PAGE_SIZE (1 << UNIV_PAGE_SIZE_SHIFT)
+
+/* Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM 32
+
+/*
+ UNIVERSAL TYPE DEFINITIONS
+ ==========================
+*/
+
+/* Note that inside MySQL 'byte' is defined as char on Linux! */
+#define byte unsigned char
+
+/* Define an unsigned integer type that is exactly 32 bits. */
+
+#if SIZEOF_INT == 4
+typedef unsigned int ib_uint32_t;
+#elif SIZEOF_LONG == 4
+typedef unsigned long ib_uint32_t;
+#else
+#error "Neither int or long is 4 bytes"
+#endif
+
+/* Another basic type we use is unsigned long integer which should be equal to
+the word size of the machine, that is on a 32-bit platform 32 bits, and on a
+64-bit platform 64 bits. We also give the printf format for the type as a
+macro ULINTPF. */
+
+#ifdef _WIN64
+typedef unsigned __int64 ulint;
+#define ULINTPF "%I64u"
+typedef __int64 lint;
+#else
+typedef unsigned long int ulint;
+#define ULINTPF "%lu"
+typedef long int lint;
+#endif
+
+#ifdef __WIN__
+typedef __int64 ib_int64_t;
+typedef unsigned __int64 ib_uint64_t;
+#elif !defined(UNIV_HOTBACKUP)
+/* Note: longlong and ulonglong come from MySQL headers. */
+typedef longlong ib_int64_t;
+typedef ulonglong ib_uint64_t;
+#endif
+
+#ifndef UNIV_HOTBACKUP
+typedef unsigned long long int ullint;
+#endif /* UNIV_HOTBACKUP */
+
+#ifndef __WIN__
+#if SIZEOF_LONG != SIZEOF_VOIDP
+#error "Error: InnoDB's ulint must be of the same size as void*"
+#endif
+#endif
+
+/* The 'undefined' value for a ulint */
+#define ULINT_UNDEFINED ((ulint)(-1))
+
+/* The undefined 32-bit unsigned integer */
+#define ULINT32_UNDEFINED 0xFFFFFFFF
+
+/* Maximum value for a ulint */
+#define ULINT_MAX ((ulint)(-2))
+
+/* Maximum value for ib_uint64_t */
+#define IB_ULONGLONG_MAX ((ib_uint64_t) (~0ULL))
+
+/* This 'ibool' type is used within Innobase. Remember that different included
+headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
+#define ibool ulint
+
+#ifndef TRUE
+
+#define TRUE 1
+#define FALSE 0
+
+#endif
+
+/* The following number as the length of a logical field means that the field
+has the SQL NULL as its value. NOTE that because we assume that the length
+of a field is a 32-bit integer when we store it, for example, to an undo log
+on disk, we must have also this number fit in 32 bits, also in 64-bit
+computers! */
+
+#define UNIV_SQL_NULL ULINT32_UNDEFINED
+
+/* Lengths which are not UNIV_SQL_NULL, but bigger than the following
+number indicate that a field contains a reference to an externally
+stored part of the field in the tablespace. The length field then
+contains the sum of the following flag and the locally stored len. */
+
+#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE)
+
+/* Some macros to improve branch prediction and reduce cache misses */
+#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER)
+/* Tell the compiler that 'expr' probably evaluates to 'constant'. */
+# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant)
+/* Tell the compiler that a pointer is likely to be NULL */
+# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ulint) ptr, 0)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read. */
+# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read or written. */
+# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3)
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+# include <sun_prefetch.h>
+#if __SUNPRO_C >= 0x550
+# undef UNIV_INTERN
+# define UNIV_INTERN __hidden
+#endif /* __SUNPRO_C >= 0x550 */
+/* Use sun_prefetch when compile with Sun Studio */
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many(addr)
+# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+#else
+/* Dummy versions of the macros */
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+#endif
+/* Tell the compiler that cond is likely to hold */
+#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE)
+/* Tell the compiler that cond is unlikely to hold */
+#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE)
+
+/* Compile-time constant of the given array's size. */
+#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+/* The return type from a thread's start function differs between Unix and
+Windows, so define a typedef for it and a macro to use at the end of such
+functions. */
+
+#ifdef __WIN__
+typedef ulint os_thread_ret_t;
+#define OS_THREAD_DUMMY_RETURN return(0)
+#else
+typedef void* os_thread_ret_t;
+#define OS_THREAD_DUMMY_RETURN return(NULL)
+#endif
+
+#include <stdio.h>
+#include "ut0dbg.h"
+#include "ut0ut.h"
+#include "db0err.h"
+#ifdef UNIV_DEBUG_VALGRIND
+# include <valgrind/memcheck.h>
+# define UNIV_MEM_VALID(addr, size) VALGRIND_MAKE_MEM_DEFINED(addr, size)
+# define UNIV_MEM_INVALID(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
+# define UNIV_MEM_FREE(addr, size) VALGRIND_MAKE_MEM_NOACCESS(addr, size)
+# define UNIV_MEM_ALLOC(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
+# define UNIV_MEM_DESC(addr, size, b) VALGRIND_CREATE_BLOCK(addr, size, b)
+# define UNIV_MEM_UNDESC(b) VALGRIND_DISCARD(b)
+# define UNIV_MEM_ASSERT_RW(addr, size) do { \
+ const void* _p = (const void*) (ulint) \
+ VALGRIND_CHECK_MEM_IS_DEFINED(addr, size); \
+ if (UNIV_LIKELY_NULL(_p)) \
+ fprintf(stderr, "%s:%d: %p[%u] undefined at %ld\n", \
+ __FILE__, __LINE__, \
+ (const void*) (addr), (unsigned) (size), (long) \
+ (((const char*) _p) - ((const char*) (addr)))); \
+ } while (0)
+# define UNIV_MEM_ASSERT_W(addr, size) do { \
+ const void* _p = (const void*) (ulint) \
+ VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, size); \
+ if (UNIV_LIKELY_NULL(_p)) \
+ fprintf(stderr, "%s:%d: %p[%u] unwritable at %ld\n", \
+ __FILE__, __LINE__, \
+ (const void*) (addr), (unsigned) (size), (long) \
+ (((const char*) _p) - ((const char*) (addr)))); \
+ } while (0)
+#else
+# define UNIV_MEM_VALID(addr, size) do {} while(0)
+# define UNIV_MEM_INVALID(addr, size) do {} while(0)
+# define UNIV_MEM_FREE(addr, size) do {} while(0)
+# define UNIV_MEM_ALLOC(addr, size) do {} while(0)
+# define UNIV_MEM_DESC(addr, size, b) do {} while(0)
+# define UNIV_MEM_UNDESC(b) do {} while(0)
+# define UNIV_MEM_ASSERT_RW(addr, size) do {} while(0)
+# define UNIV_MEM_ASSERT_W(addr, size) do {} while(0)
+#endif
+#define UNIV_MEM_ASSERT_AND_FREE(addr, size) do { \
+ UNIV_MEM_ASSERT_W(addr, size); \
+ UNIV_MEM_FREE(addr, size); \
+} while (0)
+#define UNIV_MEM_ASSERT_AND_ALLOC(addr, size) do { \
+ UNIV_MEM_ASSERT_W(addr, size); \
+ UNIV_MEM_ALLOC(addr, size); \
+} while (0)
+
+#endif
diff --git a/storage/innobase/include/usr0sess.h b/storage/innobase/include/usr0sess.h
new file mode 100644
index 00000000000..7638a0c69e2
--- /dev/null
+++ b/storage/innobase/include/usr0sess.h
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0sess.h
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0sess_h
+#define usr0sess_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "trx0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "data0data.h"
+#include "rem0rec.h"
+
+/*********************************************************************//**
+Opens a session.
+@return own: session object */
+UNIV_INTERN
+sess_t*
+sess_open(void);
+/*============*/
+/*********************************************************************//**
+Closes a session, freeing the memory occupied by it, if it is in a state
+where it should be closed.
+@return TRUE if closed */
+UNIV_INTERN
+ibool
+sess_try_close(
+/*===========*/
+ sess_t* sess); /*!< in, own: session object */
+
+/* The session handle. All fields are protected by the kernel mutex */
+struct sess_struct{
+ ulint state; /*!< state of the session */
+ trx_t* trx; /*!< transaction object permanently
+ assigned for the session: the
+ transaction instance designated by the
+ trx id changes, but the memory
+ structure is preserved */
+ UT_LIST_BASE_NODE_T(que_t)
+ graphs; /*!< query graphs belonging to this
+ session */
+};
+
+/* Session states */
+#define SESS_ACTIVE 1
+#define SESS_ERROR 2 /* session contains an error message
+ which has not yet been communicated
+ to the client */
+#ifndef UNIV_NONINL
+#include "usr0sess.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/usr0sess.ic b/storage/innobase/include/usr0sess.ic
new file mode 100644
index 00000000000..35a75d75acc
--- /dev/null
+++ b/storage/innobase/include/usr0sess.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0sess.ic
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/usr0types.h b/storage/innobase/include/usr0types.h
new file mode 100644
index 00000000000..6cc6f015613
--- /dev/null
+++ b/storage/innobase/include/usr0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0types.h
+Users and sessions global types
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0types_h
+#define usr0types_h
+
+typedef struct sess_struct sess_t;
+
+#endif
diff --git a/storage/innobase/include/ut0auxconf.h b/storage/innobase/include/ut0auxconf.h
new file mode 100644
index 00000000000..88fb26f1863
--- /dev/null
+++ b/storage/innobase/include/ut0auxconf.h
@@ -0,0 +1,14 @@
+/* Do not remove this file even though it is empty.
+This file is included in univ.i and will cause compilation failure
+if not present.
+A custom check has been added in the generated
+storage/innobase/Makefile.in that is shipped with the InnoDB Plugin
+source archive. This check tries to compile a test program and if
+successful then adds "#define HAVE_ATOMIC_PTHREAD_T" to this file.
+This is a hack that has been developed in order to check for pthread_t
+atomicity without the need to regenerate the ./configure script that is
+distributed in the MySQL 5.1 official source archives.
+If by any chance Makefile.in and ./configure are regenerated and thus
+the hack from Makefile.in wiped away then the "real" check from plug.in
+will take over.
+*/
diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h
new file mode 100644
index 00000000000..a2687e62f08
--- /dev/null
+++ b/storage/innobase/include/ut0byte.h
@@ -0,0 +1,270 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0byte.h
+Utilities for byte operations
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0byte_h
+#define ut0byte_h
+
+
+#include "univ.i"
+
+/** Pair of ulint integers. */
+typedef struct dulint_struct dulint;
+/** Type definition for a 64-bit unsigned integer, which works also
+in 32-bit machines. NOTE! Access the fields only with the accessor
+functions. This definition appears here only for the compiler to
+know the size of a dulint. */
+struct dulint_struct{
+ ulint high; /*!< most significant 32 bits */
+ ulint low; /*!< least significant 32 bits */
+};
+
+/** Zero value for a dulint */
+extern const dulint ut_dulint_zero;
+
+/** Maximum value for a dulint */
+extern const dulint ut_dulint_max;
+
+/*******************************************************//**
+Creates a 64-bit dulint out of two ulints.
+@return created dulint */
+UNIV_INLINE
+dulint
+ut_dulint_create(
+/*=============*/
+ ulint high, /*!< in: high-order 32 bits */
+ ulint low); /*!< in: low-order 32 bits */
+/*******************************************************//**
+Gets the high-order 32 bits of a dulint.
+@return 32 bits in ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_high(
+/*===============*/
+ dulint d); /*!< in: dulint */
+/*******************************************************//**
+Gets the low-order 32 bits of a dulint.
+@return 32 bits in ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_low(
+/*==============*/
+ dulint d); /*!< in: dulint */
+/*******************************************************//**
+Converts a dulint (a struct of 2 ulints) to ib_int64_t, which is a 64-bit
+integer type.
+@return value in ib_int64_t type */
+UNIV_INLINE
+ib_int64_t
+ut_conv_dulint_to_longlong(
+/*=======================*/
+ dulint d); /*!< in: dulint */
+/*******************************************************//**
+Tests if a dulint is zero.
+@return TRUE if zero */
+UNIV_INLINE
+ibool
+ut_dulint_is_zero(
+/*==============*/
+ dulint a); /*!< in: dulint */
+/*******************************************************//**
+Compares two dulints.
+@return -1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_dulint_cmp(
+/*==========*/
+ dulint a, /*!< in: dulint */
+ dulint b); /*!< in: dulint */
+/*******************************************************//**
+Calculates the max of two dulints.
+@return max(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_max(
+/*==============*/
+ dulint a, /*!< in: dulint */
+ dulint b); /*!< in: dulint */
+/*******************************************************//**
+Calculates the min of two dulints.
+@return min(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_min(
+/*==============*/
+ dulint a, /*!< in: dulint */
+ dulint b); /*!< in: dulint */
+/*******************************************************//**
+Adds a ulint to a dulint.
+@return sum a + b */
+UNIV_INLINE
+dulint
+ut_dulint_add(
+/*==========*/
+ dulint a, /*!< in: dulint */
+ ulint b); /*!< in: ulint */
+/*******************************************************//**
+Subtracts a ulint from a dulint.
+@return a - b */
+UNIV_INLINE
+dulint
+ut_dulint_subtract(
+/*===============*/
+ dulint a, /*!< in: dulint */
+ ulint b); /*!< in: ulint, b <= a */
+/*******************************************************//**
+Subtracts a dulint from another. NOTE that the difference must be positive
+and smaller that 4G.
+@return a - b */
+UNIV_INLINE
+ulint
+ut_dulint_minus(
+/*============*/
+ dulint a, /*!< in: dulint; NOTE a must be >= b and at most
+ 2 to power 32 - 1 greater */
+ dulint b); /*!< in: dulint */
+/********************************************************//**
+Rounds a dulint downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_down(
+/*=================*/
+ dulint n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number which must be a
+ power of 2 */
+/********************************************************//**
+Rounds a dulint upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_up(
+/*===============*/
+ dulint n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number which must be a
+ power of 2 */
+/********************************************************//**
+Rounds a dulint downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number
+ which must be a power of 2 */
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number
+ which must be a power of 2 */
+/*******************************************************//**
+Increments a dulint variable by 1. */
+#define UT_DULINT_INC(D)\
+{\
+ if ((D).low == 0xFFFFFFFFUL) {\
+ (D).high = (D).high + 1;\
+ (D).low = 0;\
+ } else {\
+ (D).low = (D).low + 1;\
+ }\
+}
+/*******************************************************//**
+Tests if two dulints are equal. */
+#define UT_DULINT_EQ(D1, D2) (((D1).low == (D2).low)\
+ && ((D1).high == (D2).high))
+#ifdef notdefined
+/************************************************************//**
+Sort function for dulint arrays. */
+UNIV_INTERN
+void
+ut_dulint_sort(
+/*===========*/
+ dulint* arr, /*!< in/out: array to be sorted */
+ dulint* aux_arr,/*!< in/out: auxiliary array (same size as arr) */
+ ulint low, /*!< in: low bound of sort interval, inclusive */
+ ulint high); /*!< in: high bound of sort interval, noninclusive */
+#endif /* notdefined */
+
+/*********************************************************//**
+The following function rounds up a pointer to the nearest aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+ void* ptr, /*!< in: pointer */
+ ulint align_no); /*!< in: align by this number */
+/*********************************************************//**
+The following function rounds down a pointer to the nearest
+aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+ __attribute__((const));
+/*********************************************************//**
+The following function computes the offset of a pointer from the nearest
+aligned address.
+@return distance from aligned pointer */
+UNIV_INLINE
+ulint
+ut_align_offset(
+/*============*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+ __attribute__((const));
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n); /*!< in: nth bit requested */
+/*****************************************************************//**
+Sets the nth bit of a ulint.
+@return the ulint with the bit set as requested */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n, /*!< in: nth bit requested */
+ ibool val); /*!< in: value for the bit to set */
+
+#ifndef UNIV_NONINL
+#include "ut0byte.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0byte.ic b/storage/innobase/include/ut0byte.ic
new file mode 100644
index 00000000000..e3beed65138
--- /dev/null
+++ b/storage/innobase/include/ut0byte.ic
@@ -0,0 +1,411 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0byte.ic
+Utilities for byte operations
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/*******************************************************//**
+Creates a 64-bit dulint out of two ulints.
+@return created dulint */
+UNIV_INLINE
+dulint
+ut_dulint_create(
+/*=============*/
+ ulint high, /*!< in: high-order 32 bits */
+ ulint low) /*!< in: low-order 32 bits */
+{
+ dulint res;
+
+ ut_ad(high <= 0xFFFFFFFF);
+ ut_ad(low <= 0xFFFFFFFF);
+
+ res.high = high;
+ res.low = low;
+
+ return(res);
+}
+
+/*******************************************************//**
+Gets the high-order 32 bits of a dulint.
+@return 32 bits in ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_high(
+/*===============*/
+ dulint d) /*!< in: dulint */
+{
+ return(d.high);
+}
+
+/*******************************************************//**
+Gets the low-order 32 bits of a dulint.
+@return 32 bits in ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_low(
+/*==============*/
+ dulint d) /*!< in: dulint */
+{
+ return(d.low);
+}
+
+/*******************************************************//**
+Converts a dulint (a struct of 2 ulints) to ib_int64_t, which is a 64-bit
+integer type.
+@return value in ib_int64_t type */
+UNIV_INLINE
+ib_int64_t
+ut_conv_dulint_to_longlong(
+/*=======================*/
+ dulint d) /*!< in: dulint */
+{
+ return((ib_int64_t)d.low
+ + (((ib_int64_t)d.high) << 32));
+}
+
+/*******************************************************//**
+Tests if a dulint is zero.
+@return TRUE if zero */
+UNIV_INLINE
+ibool
+ut_dulint_is_zero(
+/*==============*/
+ dulint a) /*!< in: dulint */
+{
+ if ((a.low == 0) && (a.high == 0)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************//**
+Compares two dulints.
+@return -1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_dulint_cmp(
+/*==========*/
+ dulint a, /*!< in: dulint */
+ dulint b) /*!< in: dulint */
+{
+ if (a.high > b.high) {
+ return(1);
+ } else if (a.high < b.high) {
+ return(-1);
+ } else if (a.low > b.low) {
+ return(1);
+ } else if (a.low < b.low) {
+ return(-1);
+ } else {
+ return(0);
+ }
+}
+
+/*******************************************************//**
+Calculates the max of two dulints.
+@return max(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_max(
+/*==============*/
+ dulint a, /*!< in: dulint */
+ dulint b) /*!< in: dulint */
+{
+ if (ut_dulint_cmp(a, b) > 0) {
+
+ return(a);
+ }
+
+ return(b);
+}
+
+/*******************************************************//**
+Calculates the min of two dulints.
+@return min(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_min(
+/*==============*/
+ dulint a, /*!< in: dulint */
+ dulint b) /*!< in: dulint */
+{
+ if (ut_dulint_cmp(a, b) > 0) {
+
+ return(b);
+ }
+
+ return(a);
+}
+
+/*******************************************************//**
+Adds a ulint to a dulint.
+@return sum a + b */
+UNIV_INLINE
+dulint
+ut_dulint_add(
+/*==========*/
+ dulint a, /*!< in: dulint */
+ ulint b) /*!< in: ulint */
+{
+ if (0xFFFFFFFFUL - b >= a.low) {
+ a.low += b;
+
+ return(a);
+ }
+
+ a.low = a.low - (0xFFFFFFFFUL - b) - 1;
+
+ a.high++;
+
+ return(a);
+}
+
+/*******************************************************//**
+Subtracts a ulint from a dulint.
+@return a - b */
+UNIV_INLINE
+dulint
+ut_dulint_subtract(
+/*===============*/
+ dulint a, /*!< in: dulint */
+ ulint b) /*!< in: ulint, b <= a */
+{
+ if (a.low >= b) {
+ a.low -= b;
+
+ return(a);
+ }
+
+ b -= a.low + 1;
+
+ a.low = 0xFFFFFFFFUL - b;
+
+ ut_ad(a.high > 0);
+
+ a.high--;
+
+ return(a);
+}
+
+/*******************************************************//**
+Subtracts a dulint from another. NOTE that the difference must be positive
+and smaller that 4G.
+@return a - b */
+UNIV_INLINE
+ulint
+ut_dulint_minus(
+/*============*/
+ dulint a, /*!< in: dulint; NOTE a must be >= b and at most
+ 2 to power 32 - 1 greater */
+ dulint b) /*!< in: dulint */
+{
+ ulint diff;
+
+ if (a.high == b.high) {
+ ut_ad(a.low >= b.low);
+
+ return(a.low - b.low);
+ }
+
+ ut_ad(a.high == b.high + 1);
+
+ diff = (ulint)(0xFFFFFFFFUL - b.low);
+ diff += 1 + a.low;
+
+ ut_ad(diff > a.low);
+
+ return(diff);
+}
+
+/********************************************************//**
+Rounds a dulint downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_down(
+/*=================*/
+ dulint n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number which must be a
+ power of 2 */
+{
+ ulint low, high;
+
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+
+ low = ut_dulint_get_low(n);
+ high = ut_dulint_get_high(n);
+
+ low = low & ~(align_no - 1);
+
+ return(ut_dulint_create(high, low));
+}
+
+/********************************************************//**
+Rounds a dulint upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_up(
+/*===============*/
+ dulint n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number which must be a
+ power of 2 */
+{
+ return(ut_dulint_align_down(ut_dulint_add(n, align_no - 1), align_no));
+}
+
+/********************************************************//**
+Rounds ib_uint64_t downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number
+ which must be a power of 2 */
+{
+ ut_ad(align_no > 0);
+ ut_ad(ut_is_2pow(align_no));
+
+ return(n & ~((ib_uint64_t) align_no - 1));
+}
+
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number
+ which must be a power of 2 */
+{
+ ib_uint64_t align_1 = (ib_uint64_t) align_no - 1;
+
+ ut_ad(align_no > 0);
+ ut_ad(ut_is_2pow(align_no));
+
+ return((n + align_1) & ~align_1);
+}
+
+/*********************************************************//**
+The following function rounds up a pointer to the nearest aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+ void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return((void*)((((ulint)ptr) + align_no - 1) & ~(align_no - 1)));
+}
+
+/*********************************************************//**
+The following function rounds down a pointer to the nearest
+aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return((void*)((((ulint)ptr)) & ~(align_no - 1)));
+}
+
+/*********************************************************//**
+The following function computes the offset of a pointer from the nearest
+aligned address.
+@return distance from aligned pointer */
+UNIV_INLINE
+ulint
+ut_align_offset(
+/*============*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return(((ulint)ptr) & (align_no - 1));
+}
+
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n) /*!< in: nth bit requested */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ return(1 & (a >> n));
+}
+
+/*****************************************************************//**
+Sets the nth bit of a ulint.
+@return the ulint with the bit set as requested */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n, /*!< in: nth bit requested */
+ ibool val) /*!< in: value for the bit to set */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ if (val) {
+ return(((ulint) 1 << n) | a);
+ } else {
+ return(~((ulint) 1 << n) & a);
+ }
+}
diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h
new file mode 100644
index 00000000000..78b525c38ab
--- /dev/null
+++ b/storage/innobase/include/ut0dbg.h
@@ -0,0 +1,175 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file include/ut0dbg.h
+Debug utilities for Innobase
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#ifndef ut0dbg_h
+#define ut0dbg_h
+
+#include "univ.i"
+#include <stdlib.h>
+#include "os0thread.h"
+
+#if defined(__GNUC__) && (__GNUC__ > 2)
+/** Test if an assertion fails.
+@param EXPR assertion expression
+@return nonzero if EXPR holds, zero if not */
+# define UT_DBG_FAIL(EXPR) UNIV_UNLIKELY(!((ulint)(EXPR)))
+#else
+/** This is used to eliminate compiler warnings */
+extern ulint ut_dbg_zero;
+/** Test if an assertion fails.
+@param EXPR assertion expression
+@return nonzero if EXPR holds, zero if not */
+# define UT_DBG_FAIL(EXPR) !((ulint)(EXPR) + ut_dbg_zero)
+#endif
+
+/*************************************************************//**
+Report a failed assertion. */
+UNIV_INTERN
+void
+ut_dbg_assertion_failed(
+/*====================*/
+ const char* expr, /*!< in: the failed assertion */
+ const char* file, /*!< in: source file containing the assertion */
+ ulint line); /*!< in: line number of the assertion */
+
+#ifdef __NETWARE__
+/** Flag for ignoring further assertion failures. This is set to TRUE
+when on NetWare there happens an InnoDB assertion failure or other
+fatal error condition that requires an immediate shutdown. */
+extern ibool panic_shutdown;
+/* Abort the execution. */
+void ut_dbg_panic(void);
+# define UT_DBG_PANIC ut_dbg_panic()
+/* Stop threads in ut_a(). */
+# define UT_DBG_STOP do {} while (0) /* We do not do this on NetWare */
+#else /* __NETWARE__ */
+# if defined(__WIN__) || defined(__INTEL_COMPILER)
+# undef UT_DBG_USE_ABORT
+# elif defined(__GNUC__) && (__GNUC__ > 2)
+# define UT_DBG_USE_ABORT
+# endif
+
+# ifndef UT_DBG_USE_ABORT
+/** A null pointer that will be dereferenced to trigger a memory trap */
+extern ulint* ut_dbg_null_ptr;
+# endif
+
+# if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT)
+/** If this is set to TRUE by ut_dbg_assertion_failed(), all threads
+will stop at the next ut_a() or ut_ad(). */
+extern ibool ut_dbg_stop_threads;
+
+/*************************************************************//**
+Stop a thread after assertion failure. */
+UNIV_INTERN
+void
+ut_dbg_stop_thread(
+/*===============*/
+ const char* file,
+ ulint line);
+# endif
+
+# ifdef UT_DBG_USE_ABORT
+/** Abort the execution. */
+# define UT_DBG_PANIC abort()
+/** Stop threads (null operation) */
+# define UT_DBG_STOP do {} while (0)
+# else /* UT_DBG_USE_ABORT */
+/** Abort the execution. */
+# define UT_DBG_PANIC \
+ if (*(ut_dbg_null_ptr)) ut_dbg_null_ptr = NULL
+/** Stop threads in ut_a(). */
+# define UT_DBG_STOP do \
+ if (UNIV_UNLIKELY(ut_dbg_stop_threads)) { \
+ ut_dbg_stop_thread(__FILE__, (ulint) __LINE__); \
+ } while (0)
+# endif /* UT_DBG_USE_ABORT */
+#endif /* __NETWARE__ */
+
+/** Abort execution if EXPR does not evaluate to nonzero.
+@param EXPR assertion expression that should hold */
+#define ut_a(EXPR) do { \
+ if (UT_DBG_FAIL(EXPR)) { \
+ ut_dbg_assertion_failed(#EXPR, \
+ __FILE__, (ulint) __LINE__); \
+ UT_DBG_PANIC; \
+ } \
+ UT_DBG_STOP; \
+} while (0)
+
+/** Abort execution. */
+#define ut_error do { \
+ ut_dbg_assertion_failed(0, __FILE__, (ulint) __LINE__); \
+ UT_DBG_PANIC; \
+} while (0)
+
+#ifdef UNIV_DEBUG
+/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_ad(EXPR) ut_a(EXPR)
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR) do {EXPR;} while (0)
+#else
+/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_ad(EXPR)
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR)
+#endif
+
+/** Silence warnings about an unused variable by doing a null assignment.
+@param A the unused variable */
+#define UT_NOT_USED(A) A = A
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/** structure used for recording usage statistics */
+typedef struct speedo_struct {
+ struct rusage ru; /*!< getrusage() result */
+ struct timeval tv; /*!< gettimeofday() result */
+} speedo_t;
+
+/*******************************************************************//**
+Resets a speedo (records the current time in it). */
+UNIV_INTERN
+void
+speedo_reset(
+/*=========*/
+ speedo_t* speedo); /*!< out: speedo */
+
+/*******************************************************************//**
+Shows the time elapsed and usage statistics since the last reset of a
+speedo. */
+UNIV_INTERN
+void
+speedo_show(
+/*========*/
+ const speedo_t* speedo); /*!< in: speedo */
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+
+#endif
diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h
new file mode 100644
index 00000000000..ec67f4e2a0f
--- /dev/null
+++ b/storage/innobase/include/ut0list.h
@@ -0,0 +1,172 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.h
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A double-linked list. This differs from the one in ut0lst.h in that in this
+one, each list node contains a pointer to the data, whereas the one in
+ut0lst.h uses a strategy where the list pointers are embedded in the data
+items themselves.
+
+Use this one when you need to store arbitrary data in the list where you
+can't embed the list pointers in the data, if a data item needs to be
+stored in multiple lists, etc.
+
+Note about the memory management: ib_list_t is a fixed-size struct whose
+allocation/deallocation is done through ib_list_create/ib_list_free, but the
+memory for the list nodes is allocated through a user-given memory heap,
+which can either be the same for all nodes or vary per node. Most users will
+probably want to create a memory heap to store the item-specific data, and
+pass in this same heap to the list node creation functions, thus
+automatically freeing the list node when the item's heap is freed.
+
+************************************************************************/
+
+#ifndef IB_LIST_H
+#define IB_LIST_H
+
+#include "mem0mem.h"
+
+typedef struct ib_list_struct ib_list_t;
+typedef struct ib_list_node_struct ib_list_node_t;
+typedef struct ib_list_helper_struct ib_list_helper_t;
+
+/****************************************************************//**
+Create a new list using mem_alloc. Lists created with this function must be
+freed with ib_list_free.
+@return list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create(void);
+/*=================*/
+
+
+/****************************************************************//**
+Create a new list using the given heap. ib_list_free MUST NOT BE CALLED for
+lists created with this function.
+@return list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create_heap(
+/*================*/
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Free a list. */
+UNIV_INTERN
+void
+ib_list_free(
+/*=========*/
+ ib_list_t* list); /*!< in: list */
+
+/****************************************************************//**
+Add the data to the start of the list.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_first(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Add the data to the end of the list.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Add the data after the indicated node.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_after(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* prev_node, /*!< in: node preceding new node (can
+ be NULL) */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Remove the node from the list. */
+UNIV_INTERN
+void
+ib_list_remove(
+/*===========*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* node); /*!< in: node to remove */
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+ ib_list_t* list); /*!< in: list */
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+ ib_list_t* list); /*!< in: list */
+
+/* List. */
+struct ib_list_struct {
+ ib_list_node_t* first; /*!< first node */
+ ib_list_node_t* last; /*!< last node */
+ ibool is_heap_list; /*!< TRUE if this list was
+ allocated through a heap */
+};
+
+/* A list node. */
+struct ib_list_node_struct {
+ ib_list_node_t* prev; /*!< previous node */
+ ib_list_node_t* next; /*!< next node */
+ void* data; /*!< user data */
+};
+
+/* Quite often, the only additional piece of data you need is the per-item
+memory heap, so we have this generic struct available to use in those
+cases. */
+struct ib_list_helper_struct {
+ mem_heap_t* heap; /*!< memory heap */
+ void* data; /*!< user data */
+};
+
+#ifndef UNIV_NONINL
+#include "ut0list.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0list.ic b/storage/innobase/include/ut0list.ic
new file mode 100644
index 00000000000..eb5c62796e8
--- /dev/null
+++ b/storage/innobase/include/ut0list.ic
@@ -0,0 +1,48 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.ic
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+ ib_list_t* list) /*!< in: list */
+{
+ return(list->first);
+}
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+ ib_list_t* list) /*!< in: list */
+{
+ return(list->last);
+}
diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h
new file mode 100644
index 00000000000..261d33963dc
--- /dev/null
+++ b/storage/innobase/include/ut0lst.h
@@ -0,0 +1,261 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0lst.h
+List utilities
+
+Created 9/10/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0lst_h
+#define ut0lst_h
+
+#include "univ.i"
+
+/* This module implements the two-way linear list which should be used
+if a list is used in the database. Note that a single struct may belong
+to two or more lists, provided that the list are given different names.
+An example of the usage of the lists can be found in fil0fil.c. */
+
+/*******************************************************************//**
+This macro expands to the unnamed type definition of a struct which acts
+as the two-way list base node. The base node contains pointers
+to both ends of the list and a count of nodes in the list (excluding
+the base node from the count).
+@param TYPE the name of the list node data type */
+#define UT_LIST_BASE_NODE_T(TYPE)\
+struct {\
+ ulint count; /*!< count of nodes in list */\
+ TYPE * start; /*!< pointer to list start, NULL if empty */\
+ TYPE * end; /*!< pointer to list end, NULL if empty */\
+}\
+
+/*******************************************************************//**
+This macro expands to the unnamed type definition of a struct which
+should be embedded in the nodes of the list, the node type must be a struct.
+This struct contains the pointers to next and previous nodes in the list.
+The name of the field in the node struct should be the name given
+to the list.
+@param TYPE the list node type name */
+/* Example:
+typedef struct LRU_node_struct LRU_node_t;
+struct LRU_node_struct {
+ UT_LIST_NODE_T(LRU_node_t) LRU_list;
+ ...
+}
+The example implements an LRU list of name LRU_list. Its nodes are of type
+LRU_node_t. */
+
+#define UT_LIST_NODE_T(TYPE)\
+struct {\
+ TYPE * prev; /*!< pointer to the previous node,\
+ NULL if start of list */\
+ TYPE * next; /*!< pointer to next node, NULL if end of list */\
+}\
+
+/*******************************************************************//**
+Initializes the base node of a two-way list.
+@param BASE the list base node
+*/
+#define UT_LIST_INIT(BASE)\
+{\
+ (BASE).count = 0;\
+ (BASE).start = NULL;\
+ (BASE).end = NULL;\
+}\
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
+@param NAME list name
+@param BASE the base node (not a pointer to it)
+@param N pointer to the node to be added to the list.
+*/
+#define UT_LIST_ADD_FIRST(NAME, BASE, N)\
+{\
+ ut_ad(N);\
+ ((BASE).count)++;\
+ ((N)->NAME).next = (BASE).start;\
+ ((N)->NAME).prev = NULL;\
+ if (UNIV_LIKELY((BASE).start != NULL)) {\
+ ut_ad((BASE).start != (N));\
+ (((BASE).start)->NAME).prev = (N);\
+ }\
+ (BASE).start = (N);\
+ if (UNIV_UNLIKELY((BASE).end == NULL)) {\
+ (BASE).end = (N);\
+ }\
+}\
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param NAME list name
+@param BASE the base node (not a pointer to it)
+@param N pointer to the node to be added to the list
+*/
+#define UT_LIST_ADD_LAST(NAME, BASE, N)\
+{\
+ ut_ad(N);\
+ ((BASE).count)++;\
+ ((N)->NAME).prev = (BASE).end;\
+ ((N)->NAME).next = NULL;\
+ if ((BASE).end != NULL) {\
+ ut_ad((BASE).end != (N));\
+ (((BASE).end)->NAME).next = (N);\
+ }\
+ (BASE).end = (N);\
+ if ((BASE).start == NULL) {\
+ (BASE).start = (N);\
+ }\
+}\
+
+/*******************************************************************//**
+Inserts a NODE2 after NODE1 in a list.
+@param NAME list name
+@param BASE the base node (not a pointer to it)
+@param NODE1 pointer to node after which NODE2 is inserted
+@param NODE2 pointer to node being inserted after NODE1
+*/
+#define UT_LIST_INSERT_AFTER(NAME, BASE, NODE1, NODE2)\
+{\
+ ut_ad(NODE1);\
+ ut_ad(NODE2);\
+ ut_ad((NODE1) != (NODE2));\
+ ((BASE).count)++;\
+ ((NODE2)->NAME).prev = (NODE1);\
+ ((NODE2)->NAME).next = ((NODE1)->NAME).next;\
+ if (((NODE1)->NAME).next != NULL) {\
+ ((((NODE1)->NAME).next)->NAME).prev = (NODE2);\
+ }\
+ ((NODE1)->NAME).next = (NODE2);\
+ if ((BASE).end == (NODE1)) {\
+ (BASE).end = (NODE2);\
+ }\
+}\
+
+#ifdef UNIV_LIST_DEBUG
+/** Invalidate the pointers in a list node.
+@param NAME list name
+@param N pointer to the node that was removed */
+# define UT_LIST_REMOVE_CLEAR(NAME, N) \
+((N)->NAME.prev = (N)->NAME.next = (void*) -1)
+#else
+/** Invalidate the pointers in a list node.
+@param NAME list name
+@param N pointer to the node that was removed */
+# define UT_LIST_REMOVE_CLEAR(NAME, N) while (0)
+#endif
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param NAME list name
+@param BASE the base node (not a pointer to it)
+@param N pointer to the node to be removed from the list
+*/
+#define UT_LIST_REMOVE(NAME, BASE, N) \
+do { \
+ ut_ad(N); \
+ ut_a((BASE).count > 0); \
+ ((BASE).count)--; \
+ if (((N)->NAME).next != NULL) { \
+ ((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev; \
+ } else { \
+ (BASE).end = ((N)->NAME).prev; \
+ } \
+ if (((N)->NAME).prev != NULL) { \
+ ((((N)->NAME).prev)->NAME).next = ((N)->NAME).next; \
+ } else { \
+ (BASE).start = ((N)->NAME).next; \
+ } \
+ UT_LIST_REMOVE_CLEAR(NAME, N); \
+} while (0)
+
+/********************************************************************//**
+Gets the next node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the successor of N in NAME, or NULL */
+#define UT_LIST_GET_NEXT(NAME, N)\
+ (((N)->NAME).next)
+
+/********************************************************************//**
+Gets the previous node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the predecessor of N in NAME, or NULL */
+#define UT_LIST_GET_PREV(NAME, N)\
+ (((N)->NAME).prev)
+
+/********************************************************************//**
+Alternative macro to get the number of nodes in a two-way list, i.e.,
+its length.
+@param BASE the base node (not a pointer to it).
+@return the number of nodes in the list */
+#define UT_LIST_GET_LEN(BASE)\
+ (BASE).count
+
+/********************************************************************//**
+Gets the first node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return first node, or NULL if the list is empty */
+#define UT_LIST_GET_FIRST(BASE)\
+ (BASE).start
+
+/********************************************************************//**
+Gets the last node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return last node, or NULL if the list is empty */
+#define UT_LIST_GET_LAST(BASE)\
+ (BASE).end
+
+/********************************************************************//**
+Checks the consistency of a two-way list.
+@param NAME the name of the list
+@param TYPE node type
+@param BASE base node (not a pointer to it)
+@param ASSERTION a condition on ut_list_node_313 */
+#define UT_LIST_VALIDATE(NAME, TYPE, BASE, ASSERTION) \
+do { \
+ ulint ut_list_i_313; \
+ TYPE* ut_list_node_313; \
+ \
+ ut_list_node_313 = (BASE).start; \
+ \
+ for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
+ ut_a(ut_list_node_313); \
+ ASSERTION; \
+ ut_ad((ut_list_node_313->NAME).next || !ut_list_i_313); \
+ ut_list_node_313 = (ut_list_node_313->NAME).next; \
+ } \
+ \
+ ut_a(ut_list_node_313 == NULL); \
+ \
+ ut_list_node_313 = (BASE).end; \
+ \
+ for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
+ ut_a(ut_list_node_313); \
+ ASSERTION; \
+ ut_ad((ut_list_node_313->NAME).prev || !ut_list_i_313); \
+ ut_list_node_313 = (ut_list_node_313->NAME).prev; \
+ } \
+ \
+ ut_a(ut_list_node_313 == NULL); \
+} while (0)
+
+#endif
+
diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h
new file mode 100644
index 00000000000..cf41cba4643
--- /dev/null
+++ b/storage/innobase/include/ut0mem.h
@@ -0,0 +1,306 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.h
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include "univ.i"
+#include <string.h>
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h"
+
+/** The total amount of memory currently allocated from the operating
+system with os_mem_alloc_large() or malloc(). Does not count malloc()
+if srv_use_sys_malloc is set. Protected by ut_list_mutex. */
+extern ulint ut_total_allocated_memory;
+
+/** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */
+extern os_fast_mutex_t ut_list_mutex;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Wrapper for memcpy(3). Copy memory area when the source and
+target are not overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, const void* sour, ulint n);
+
+/** Wrapper for memmove(3). Copy memory area when the source and
+target are overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memmove(void* dest, const void* sour, ulint n);
+
+/** Wrapper for memcmp(3). Compare memory areas.
+* @param str1 in: first memory block to compare
+* @param str2 in: second memory block to compare
+* @param n in: number of bytes to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_memcmp(const void* str1, const void* str2, ulint n);
+
+/**********************************************************************//**
+Initializes the mem block list at database startup. */
+UNIV_INTERN
+void
+ut_mem_init(void);
+/*=============*/
+
+/**********************************************************************//**
+Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is
+defined and set_to_zero is TRUE.
+@return own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc_low(
+/*==========*/
+ ulint n, /*!< in: number of bytes to allocate */
+ ibool set_to_zero, /*!< in: TRUE if allocated memory
+ should be set to zero if
+ UNIV_SET_MEM_TO_ZERO is defined */
+ ibool assert_on_error); /*!< in: if TRUE, we crash mysqld if
+ the memory cannot be allocated */
+/**********************************************************************//**
+Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is
+defined.
+@return own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc(
+/*======*/
+ ulint n); /*!< in: number of bytes to allocate */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Tests if malloc of n bytes would succeed. ut_malloc() asserts if memory runs
+out. It cannot be used if we want to return an error message. Prints to
+stderr a message if fails.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+ut_test_malloc(
+/*===========*/
+ ulint n); /*!< in: try to allocate this many bytes */
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Frees a memory block allocated with ut_malloc. */
+UNIV_INTERN
+void
+ut_free(
+/*====*/
+ void* ptr); /*!< in, own: memory block */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not
+use this function because the allocation functions in mem0mem.h are the
+recommended ones in InnoDB.
+
+man realloc in Linux, 2004:
+
+ realloc() changes the size of the memory block pointed to
+ by ptr to size bytes. The contents will be unchanged to
+ the minimum of the old and new sizes; newly allocated mem­
+ ory will be uninitialized. If ptr is NULL, the call is
+ equivalent to malloc(size); if size is equal to zero, the
+ call is equivalent to free(ptr). Unless ptr is NULL, it
+ must have been returned by an earlier call to malloc(),
+ calloc() or realloc().
+
+RETURN VALUE
+ realloc() returns a pointer to the newly allocated memory,
+ which is suitably aligned for any kind of variable and may
+ be different from ptr, or NULL if the request fails. If
+ size was equal to 0, either NULL or a pointer suitable to
+ be passed to free() is returned. If realloc() fails the
+ original block is left untouched - it is not freed or
+ moved.
+@return own: pointer to new mem block or NULL */
+UNIV_INTERN
+void*
+ut_realloc(
+/*=======*/
+ void* ptr, /*!< in: pointer to old block or NULL */
+ ulint size); /*!< in: desired size */
+/**********************************************************************//**
+Frees in shutdown all allocated memory not freed yet. */
+UNIV_INTERN
+void
+ut_free_all_mem(void);
+/*=================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/** Wrapper for strcpy(3). Copy a NUL-terminated string.
+* @param dest in: copy to
+* @param sour in: copy from
+* @return dest */
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, const char* sour);
+
+/** Wrapper for strlen(3). Determine the length of a NUL-terminated string.
+* @param str in: string
+* @return length of the string in bytes, excluding the terminating NUL */
+UNIV_INLINE
+ulint
+ut_strlen(const char* str);
+
+/** Wrapper for strcmp(3). Compare NUL-terminated strings.
+* @param str1 in: first string to compare
+* @param str2 in: second string to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_strcmp(const char* str1, const char* str2);
+
+/**********************************************************************//**
+Copies up to size - 1 characters from the NUL-terminated string src to
+dst, NUL-terminating the result. Returns strlen(src), so truncation
+occurred if the return value >= size.
+@return strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy(
+/*=======*/
+ char* dst, /*!< in: destination buffer */
+ const char* src, /*!< in: source buffer */
+ ulint size); /*!< in: size of destination buffer */
+
+/**********************************************************************//**
+Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last
+(size - 1) bytes of src, not the first.
+@return strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy_rev(
+/*===========*/
+ char* dst, /*!< in: destination buffer */
+ const char* src, /*!< in: source buffer */
+ ulint size); /*!< in: size of destination buffer */
+
+/**********************************************************************//**
+Compute strlen(ut_strcpyq(str, q)).
+@return length of the string when quoted */
+UNIV_INLINE
+ulint
+ut_strlenq(
+/*=======*/
+ const char* str, /*!< in: null-terminated string */
+ char q); /*!< in: the quote character */
+
+/**********************************************************************//**
+Make a quoted copy of a NUL-terminated string. Leading and trailing
+quotes will not be included; only embedded quotes will be escaped.
+See also ut_strlenq() and ut_memcpyq().
+@return pointer to end of dest */
+UNIV_INTERN
+char*
+ut_strcpyq(
+/*=======*/
+ char* dest, /*!< in: output buffer */
+ char q, /*!< in: the quote character */
+ const char* src); /*!< in: null-terminated string */
+
+/**********************************************************************//**
+Make a quoted copy of a fixed-length string. Leading and trailing
+quotes will not be included; only embedded quotes will be escaped.
+See also ut_strlenq() and ut_strcpyq().
+@return pointer to end of dest */
+UNIV_INTERN
+char*
+ut_memcpyq(
+/*=======*/
+ char* dest, /*!< in: output buffer */
+ char q, /*!< in: the quote character */
+ const char* src, /*!< in: string to be quoted */
+ ulint len); /*!< in: length of src */
+
+/**********************************************************************//**
+Return the number of times s2 occurs in s1. Overlapping instances of s2
+are only counted once.
+@return the number of times s2 occurs in s1 */
+UNIV_INTERN
+ulint
+ut_strcount(
+/*========*/
+ const char* s1, /*!< in: string to search in */
+ const char* s2); /*!< in: string to search for */
+
+/**********************************************************************//**
+Replace every occurrence of s1 in str with s2. Overlapping instances of s1
+are only replaced once.
+@return own: modified string, must be freed with mem_free() */
+UNIV_INTERN
+char*
+ut_strreplace(
+/*==========*/
+ const char* str, /*!< in: string to operate on */
+ const char* s1, /*!< in: string to replace */
+ const char* s2); /*!< in: string to replace s1 with */
+
+/**********************************************************************//**
+Converts a raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex", make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+ const void* raw, /*!< in: raw data */
+ ulint raw_size, /*!< in: "raw" length in bytes */
+ char* hex, /*!< out: hex string */
+ ulint hex_size); /*!< in: "hex" size in bytes */
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+ const char* str, /*!< in: string */
+ ulint str_len, /*!< in: string length in bytes */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
+
+#ifndef UNIV_NONINL
+#include "ut0mem.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0mem.ic b/storage/innobase/include/ut0mem.ic
new file mode 100644
index 00000000000..f36c28f1989
--- /dev/null
+++ b/storage/innobase/include/ut0mem.ic
@@ -0,0 +1,338 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.ic
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#include "ut0byte.h"
+#include "mach0data.h"
+
+/** Wrapper for memcpy(3). Copy memory area when the source and
+target are not overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, const void* sour, ulint n)
+{
+ return(memcpy(dest, sour, n));
+}
+
+/** Wrapper for memmove(3). Copy memory area when the source and
+target are overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memmove(void* dest, const void* sour, ulint n)
+{
+ return(memmove(dest, sour, n));
+}
+
+/** Wrapper for memcmp(3). Compare memory areas.
+* @param str1 in: first memory block to compare
+* @param str2 in: second memory block to compare
+* @param n in: number of bytes to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_memcmp(const void* str1, const void* str2, ulint n)
+{
+ return(memcmp(str1, str2, n));
+}
+
+/** Wrapper for strcpy(3). Copy a NUL-terminated string.
+* @param dest in: copy to
+* @param sour in: copy from
+* @return dest */
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, const char* sour)
+{
+ return(strcpy(dest, sour));
+}
+
+/** Wrapper for strlen(3). Determine the length of a NUL-terminated string.
+* @param str in: string
+* @return length of the string in bytes, excluding the terminating NUL */
+UNIV_INLINE
+ulint
+ut_strlen(const char* str)
+{
+ return(strlen(str));
+}
+
+/** Wrapper for strcmp(3). Compare NUL-terminated strings.
+* @param str1 in: first string to compare
+* @param str2 in: second string to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_strcmp(const char* str1, const char* str2)
+{
+ return(strcmp(str1, str2));
+}
+
+/**********************************************************************//**
+Compute strlen(ut_strcpyq(str, q)).
+@return length of the string when quoted */
+UNIV_INLINE
+ulint
+ut_strlenq(
+/*=======*/
+ const char* str, /*!< in: null-terminated string */
+ char q) /*!< in: the quote character */
+{
+ ulint len;
+
+ for (len = 0; *str; len++, str++) {
+ if (*str == q) {
+ len++;
+ }
+ }
+
+ return(len);
+}
+
+/**********************************************************************//**
+Converts a raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex", make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+ const void* raw, /*!< in: raw data */
+ ulint raw_size, /*!< in: "raw" length in bytes */
+ char* hex, /*!< out: hex string */
+ ulint hex_size) /*!< in: "hex" size in bytes */
+{
+
+#ifdef WORDS_BIGENDIAN
+
+#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b))
+
+#define UINT16_GET_A(u) ((unsigned char) ((u) >> 8))
+#define UINT16_GET_B(u) ((unsigned char) ((u) & 0xFF))
+
+#else /* WORDS_BIGENDIAN */
+
+#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a))
+
+#define UINT16_GET_A(u) ((unsigned char) ((u) & 0xFF))
+#define UINT16_GET_B(u) ((unsigned char) ((u) >> 8))
+
+#endif /* WORDS_BIGENDIAN */
+
+#define MK_ALL_UINT16_WITH_A(a) \
+ MK_UINT16(a, '0'), \
+ MK_UINT16(a, '1'), \
+ MK_UINT16(a, '2'), \
+ MK_UINT16(a, '3'), \
+ MK_UINT16(a, '4'), \
+ MK_UINT16(a, '5'), \
+ MK_UINT16(a, '6'), \
+ MK_UINT16(a, '7'), \
+ MK_UINT16(a, '8'), \
+ MK_UINT16(a, '9'), \
+ MK_UINT16(a, 'A'), \
+ MK_UINT16(a, 'B'), \
+ MK_UINT16(a, 'C'), \
+ MK_UINT16(a, 'D'), \
+ MK_UINT16(a, 'E'), \
+ MK_UINT16(a, 'F')
+
+ static const uint16 hex_map[256] = {
+ MK_ALL_UINT16_WITH_A('0'),
+ MK_ALL_UINT16_WITH_A('1'),
+ MK_ALL_UINT16_WITH_A('2'),
+ MK_ALL_UINT16_WITH_A('3'),
+ MK_ALL_UINT16_WITH_A('4'),
+ MK_ALL_UINT16_WITH_A('5'),
+ MK_ALL_UINT16_WITH_A('6'),
+ MK_ALL_UINT16_WITH_A('7'),
+ MK_ALL_UINT16_WITH_A('8'),
+ MK_ALL_UINT16_WITH_A('9'),
+ MK_ALL_UINT16_WITH_A('A'),
+ MK_ALL_UINT16_WITH_A('B'),
+ MK_ALL_UINT16_WITH_A('C'),
+ MK_ALL_UINT16_WITH_A('D'),
+ MK_ALL_UINT16_WITH_A('E'),
+ MK_ALL_UINT16_WITH_A('F')
+ };
+ const unsigned char* rawc;
+ ulint read_bytes;
+ ulint write_bytes;
+ ulint i;
+
+ rawc = (const unsigned char*) raw;
+
+ if (hex_size == 0) {
+
+ return(0);
+ }
+
+ if (hex_size <= 2 * raw_size) {
+
+ read_bytes = hex_size / 2;
+ write_bytes = hex_size;
+ } else {
+
+ read_bytes = raw_size;
+ write_bytes = 2 * raw_size + 1;
+ }
+
+#define LOOP_READ_BYTES(ASSIGN) \
+ for (i = 0; i < read_bytes; i++) { \
+ ASSIGN; \
+ hex += 2; \
+ rawc++; \
+ }
+
+ if (ut_align_offset(hex, 2) == 0) {
+
+ LOOP_READ_BYTES(
+ *(uint16*) hex = hex_map[*rawc]
+ );
+ } else {
+
+ LOOP_READ_BYTES(
+ *hex = UINT16_GET_A(hex_map[*rawc]);
+ *(hex + 1) = UINT16_GET_B(hex_map[*rawc])
+ );
+ }
+
+ if (hex_size <= 2 * raw_size && hex_size % 2 == 0) {
+
+ hex--;
+ }
+
+ *hex = '\0';
+
+ return(write_bytes);
+}
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+ const char* str, /*!< in: string */
+ ulint str_len, /*!< in: string length in bytes */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ ulint str_i;
+ ulint buf_i;
+
+ buf_i = 0;
+
+ switch (buf_size) {
+ case 3:
+
+ if (str_len == 0) {
+
+ buf[buf_i] = '\'';
+ buf_i++;
+ buf[buf_i] = '\'';
+ buf_i++;
+ }
+ /* FALLTHROUGH */
+ case 2:
+ case 1:
+
+ buf[buf_i] = '\0';
+ buf_i++;
+ /* FALLTHROUGH */
+ case 0:
+
+ return(buf_i);
+ }
+
+ /* buf_size >= 4 */
+
+ buf[0] = '\'';
+ buf_i = 1;
+
+ for (str_i = 0; str_i < str_len; str_i++) {
+
+ char ch;
+
+ if (buf_size - buf_i == 2) {
+
+ break;
+ }
+
+ ch = str[str_i];
+
+ switch (ch) {
+ case '\0':
+
+ if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+
+ goto func_exit;
+ }
+ buf[buf_i] = '\\';
+ buf_i++;
+ buf[buf_i] = '0';
+ buf_i++;
+ break;
+ case '\'':
+ case '\\':
+
+ if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+
+ goto func_exit;
+ }
+ buf[buf_i] = ch;
+ buf_i++;
+ /* FALLTHROUGH */
+ default:
+
+ buf[buf_i] = ch;
+ buf_i++;
+ }
+ }
+
+func_exit:
+
+ buf[buf_i] = '\'';
+ buf_i++;
+ buf[buf_i] = '\0';
+ buf_i++;
+
+ return(buf_i);
+}
diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h
new file mode 100644
index 00000000000..ce5152e942f
--- /dev/null
+++ b/storage/innobase/include/ut0rnd.h
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0rnd.h
+Random numbers and hashing
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0rnd_h
+#define ut0rnd_h
+
+#include "univ.i"
+
+#include "ut0byte.h"
+
+/** The 'character code' for end of field or string (used
+in folding records */
+#define UT_END_OF_FIELD 257
+
+/********************************************************//**
+This is used to set the random number seed. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+ ulint seed); /*!< in: seed */
+/********************************************************//**
+The following function generates a series of 'random' ulint integers.
+@return the next 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+ ulint rnd); /*!< in: the previous random number value */
+/*********************************************************//**
+The following function generates 'random' ulint integers which
+enumerate the value space (let there be N of them) of ulint integers
+in a pseudo-random fashion. Note that the same integer is repeated
+always after N calls to the generator.
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_ulint(void);
+/*==================*/
+/********************************************************//**
+Generates a random integer from a given interval.
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_interval(
+/*============*/
+ ulint low, /*!< in: low limit; can generate also this value */
+ ulint high); /*!< in: high limit; can generate also this value */
+/*********************************************************//**
+Generates a random iboolean value.
+@return the random value */
+UNIV_INLINE
+ibool
+ut_rnd_gen_ibool(void);
+/*=================*/
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime or some
+random number to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+ ulint key, /*!< in: value to be hashed */
+ ulint table_size); /*!< in: hash table size */
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ ulint n1, /*!< in: ulint */
+ ulint n2) /*!< in: ulint */
+ __attribute__((const));
+/*************************************************************//**
+Folds a dulint.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_dulint(
+/*===========*/
+ dulint d) /*!< in: dulint */
+ __attribute__((const));
+/*************************************************************//**
+Folds a character string ending in the null character.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+ const char* str) /*!< in: null-terminated string */
+ __attribute__((pure));
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+ __attribute__((pure));
+/***********************************************************//**
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2.
+@return prime */
+UNIV_INTERN
+ulint
+ut_find_prime(
+/*==========*/
+ ulint n) /*!< in: positive number > 100 */
+ __attribute__((const));
+
+
+#ifndef UNIV_NONINL
+#include "ut0rnd.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0rnd.ic b/storage/innobase/include/ut0rnd.ic
new file mode 100644
index 00000000000..763469142ec
--- /dev/null
+++ b/storage/innobase/include/ut0rnd.ic
@@ -0,0 +1,230 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0rnd.ic
+Random numbers and hashing
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#define UT_HASH_RANDOM_MASK 1463735687
+#define UT_HASH_RANDOM_MASK2 1653893711
+#define UT_RND1 151117737
+#define UT_RND2 119785373
+#define UT_RND3 85689495
+#define UT_RND4 76595339
+#define UT_SUM_RND2 98781234
+#define UT_SUM_RND3 126792457
+#define UT_SUM_RND4 63498502
+#define UT_XOR_RND1 187678878
+#define UT_XOR_RND2 143537923
+
+/** Seed value of ut_rnd_gen_ulint() */
+extern ulint ut_rnd_ulint_counter;
+
+/********************************************************//**
+This is used to set the random number seed. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+ ulint seed) /*!< in: seed */
+{
+ ut_rnd_ulint_counter = seed;
+}
+
+/********************************************************//**
+The following function generates a series of 'random' ulint integers.
+@return the next 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+ ulint rnd) /*!< in: the previous random number value */
+{
+ ulint n_bits;
+
+ n_bits = 8 * sizeof(ulint);
+
+ rnd = UT_RND2 * rnd + UT_SUM_RND3;
+ rnd = UT_XOR_RND1 ^ rnd;
+ rnd = (rnd << 20) + (rnd >> (n_bits - 20));
+ rnd = UT_RND3 * rnd + UT_SUM_RND4;
+ rnd = UT_XOR_RND2 ^ rnd;
+ rnd = (rnd << 20) + (rnd >> (n_bits - 20));
+ rnd = UT_RND1 * rnd + UT_SUM_RND2;
+
+ return(rnd);
+}
+
+/********************************************************//**
+The following function generates 'random' ulint integers which
+enumerate the value space of ulint integers in a pseudo random
+fashion. Note that the same integer is repeated always after
+2 to power 32 calls to the generator (if ulint is 32-bit).
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_ulint(void)
+/*==================*/
+{
+ ulint rnd;
+ ulint n_bits;
+
+ n_bits = 8 * sizeof(ulint);
+
+ ut_rnd_ulint_counter = UT_RND1 * ut_rnd_ulint_counter + UT_RND2;
+
+ rnd = ut_rnd_gen_next_ulint(ut_rnd_ulint_counter);
+
+ return(rnd);
+}
+
+/********************************************************//**
+Generates a random integer from a given interval.
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_interval(
+/*============*/
+ ulint low, /*!< in: low limit; can generate also this value */
+ ulint high) /*!< in: high limit; can generate also this value */
+{
+ ulint rnd;
+
+ ut_ad(high >= low);
+
+ if (low == high) {
+
+ return(low);
+ }
+
+ rnd = ut_rnd_gen_ulint();
+
+ return(low + (rnd % (high - low + 1)));
+}
+
+/*********************************************************//**
+Generates a random iboolean value.
+@return the random value */
+UNIV_INLINE
+ibool
+ut_rnd_gen_ibool(void)
+/*=================*/
+{
+ ulint x;
+
+ x = ut_rnd_gen_ulint();
+
+ if (((x >> 20) + (x >> 15)) & 1) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime
+or some random number for the hash table to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+ ulint key, /*!< in: value to be hashed */
+ ulint table_size) /*!< in: hash table size */
+{
+ key = key ^ UT_HASH_RANDOM_MASK2;
+
+ return(key % table_size);
+}
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ ulint n1, /*!< in: ulint */
+ ulint n2) /*!< in: ulint */
+{
+ return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
+ ^ UT_HASH_RANDOM_MASK) + n2);
+}
+
+/*************************************************************//**
+Folds a dulint.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_dulint(
+/*===========*/
+ dulint d) /*!< in: dulint */
+{
+ return(ut_fold_ulint_pair(ut_dulint_get_low(d),
+ ut_dulint_get_high(d)));
+}
+
+/*************************************************************//**
+Folds a character string ending in the null character.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+ const char* str) /*!< in: null-terminated string */
+{
+ ulint fold = 0;
+
+ ut_ad(str);
+
+ while (*str != '\0') {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+ str++;
+ }
+
+ return(fold);
+}
+
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+{
+ const byte* str_end = str + len;
+ ulint fold = 0;
+
+ ut_ad(str || !len);
+
+ while (str < str_end) {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+
+ str++;
+ }
+
+ return(fold);
+}
diff --git a/storage/innobase/include/ut0sort.h b/storage/innobase/include/ut0sort.h
new file mode 100644
index 00000000000..5c6647dda9e
--- /dev/null
+++ b/storage/innobase/include/ut0sort.h
@@ -0,0 +1,106 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0sort.h
+Sort utility
+
+Created 11/9/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0sort_h
+#define ut0sort_h
+
+#include "univ.i"
+
+/* This module gives a macro definition of the body of
+a standard sort function for an array of elements of any
+type. The comparison function is given as a parameter to
+the macro. The sort algorithm is mergesort which has logarithmic
+worst case.
+*/
+
+/*******************************************************************//**
+This macro expands to the body of a standard sort function.
+The sort function uses mergesort and must be defined separately
+for each type of array.
+Also the comparison function has to be defined individually
+for each array cell type. SORT_FUN is the sort function name.
+The function takes the array to be sorted (ARR),
+the array of auxiliary space (AUX_ARR) of same size,
+and the low (LOW), inclusive, and high (HIGH), noninclusive,
+limits for the sort interval as arguments.
+CMP_FUN is the comparison function name. It takes as arguments
+two elements from the array and returns 1, if the first is bigger,
+0 if equal, and -1 if the second bigger. */
+
+#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\
+{\
+ ulint ut_sort_mid77;\
+ ulint ut_sort_i77;\
+ ulint ut_sort_low77;\
+ ulint ut_sort_high77;\
+\
+ ut_ad((LOW) < (HIGH));\
+ ut_ad(ARR);\
+ ut_ad(AUX_ARR);\
+\
+ if ((LOW) == (HIGH) - 1) {\
+ return;\
+ } else if ((LOW) == (HIGH) - 2) {\
+ if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\
+ (AUX_ARR)[LOW] = (ARR)[LOW];\
+ (ARR)[LOW] = (ARR)[(HIGH) - 1];\
+ (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\
+ }\
+ return;\
+ }\
+\
+ ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\
+\
+ SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\
+ SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\
+\
+ ut_sort_low77 = (LOW);\
+ ut_sort_high77 = ut_sort_mid77;\
+\
+ for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\
+\
+ if (ut_sort_low77 >= ut_sort_mid77) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else if (ut_sort_high77 >= (HIGH)) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ } else if (CMP_FUN((ARR)[ut_sort_low77],\
+ (ARR)[ut_sort_high77]) > 0) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ }\
+ }\
+\
+ memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\
+ ((HIGH) - (LOW)) * sizeof *(ARR));\
+}\
+
+
+#endif
+
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
new file mode 100644
index 00000000000..80094321041
--- /dev/null
+++ b/storage/innobase/include/ut0ut.h
@@ -0,0 +1,385 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Sun Microsystems, Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
+are described briefly in the InnoDB documentation. The contributions by
+Sun Microsystems are incorporated with their permission, and subject to the
+conditions contained in the file COPYING.Sun_Microsystems.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0ut.h
+Various utilities
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0ut_h
+#define ut0ut_h
+
+#include "univ.i"
+#include <time.h>
+#ifndef MYSQL_SERVER
+#include <ctype.h>
+#endif
+
+/** Index name prefix in fast index creation */
+#define TEMP_INDEX_PREFIX '\377'
+/** Index name prefix in fast index creation, as a string constant */
+#define TEMP_INDEX_PREFIX_STR "\377"
+
+/** Time stamp */
+typedef time_t ib_time_t;
+
+#if defined(IB_HAVE_PAUSE_INSTRUCTION)
+# ifdef WIN32
+ /* In the Win32 API, the x86 PAUSE instruction is executed by calling
+ the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
+ independent way by using YieldProcessor.*/
+# define UT_RELAX_CPU() YieldProcessor()
+# else
+ /* According to the gcc info page, asm volatile means that the
+ instruction has important side-effects and must not be removed.
+ Also asm volatile may trigger a memory barrier (spilling all registers
+ to memory). */
+# define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
+# endif
+#elif defined(HAVE_ATOMIC_BUILTINS)
+# define UT_RELAX_CPU() do { \
+ volatile lint volatile_var; \
+ os_compare_and_swap_lint(&volatile_var, 0, 1); \
+ } while (0)
+#else
+# define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */
+#endif
+
+/*********************************************************************//**
+Delays execution for at most max_wait_us microseconds or returns earlier
+if cond becomes true.
+@param cond in: condition to wait for; evaluated every 2 ms
+@param max_wait_us in: maximum delay to wait, in microseconds */
+#define UT_WAIT_FOR(cond, max_wait_us) \
+do { \
+ ullint start_us; \
+ start_us = ut_time_us(NULL); \
+ while (!(cond) \
+ && ut_time_us(NULL) - start_us < (max_wait_us)) {\
+ \
+ os_thread_sleep(2000 /* 2 ms */); \
+ } \
+} while (0)
+
+/********************************************************//**
+Gets the high 32 bits in a ulint. That is makes a shift >> 32,
+but since there seem to be compiler bugs in both gcc and Visual C++,
+we do this by a special conversion.
+@return a >> 32 */
+UNIV_INTERN
+ulint
+ut_get_high32(
+/*==========*/
+ ulint a); /*!< in: ulint */
+/******************************************************//**
+Calculates the minimum of two ulints.
+@return minimum */
+UNIV_INLINE
+ulint
+ut_min(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2); /*!< in: second number */
+/******************************************************//**
+Calculates the maximum of two ulints.
+@return maximum */
+UNIV_INLINE
+ulint
+ut_max(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2); /*!< in: second number */
+/****************************************************************//**
+Calculates minimum of two ulint-pairs. */
+UNIV_INLINE
+void
+ut_pair_min(
+/*========*/
+ ulint* a, /*!< out: more significant part of minimum */
+ ulint* b, /*!< out: less significant part of minimum */
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint b1, /*!< in: less significant part of first pair */
+ ulint a2, /*!< in: more significant part of second pair */
+ ulint b2); /*!< in: less significant part of second pair */
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ ulint a, /*!< in: ulint */
+ ulint b); /*!< in: ulint */
+/*******************************************************//**
+Compares two pairs of ulints.
+@return -1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_pair_cmp(
+/*========*/
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint a2, /*!< in: less significant part of first pair */
+ ulint b1, /*!< in: more significant part of second pair */
+ ulint b2); /*!< in: less significant part of second pair */
+/*************************************************************//**
+Determines if a number is zero or a power of two.
+@param n in: number
+@return nonzero if n is zero or a power of two; zero otherwise */
+#define ut_is_2pow(n) UNIV_LIKELY(!((n) & ((n) - 1)))
+/*************************************************************//**
+Calculates fast the remainder of n/m when m is a power of two.
+@param n in: numerator
+@param m in: denominator, must be a power of two
+@return the remainder of n/m */
+#define ut_2pow_remainder(n, m) ((n) & ((m) - 1))
+/*************************************************************//**
+Calculates the biggest multiple of m that is not bigger than n
+when m is a power of two. In other words, rounds n down to m * k.
+@param n in: number to round down
+@param m in: alignment, must be a power of two
+@return n rounded down to the biggest possible integer multiple of m */
+#define ut_2pow_round(n, m) ((n) & ~((m) - 1))
+/** Align a number down to a multiple of a power of two.
+@param n in: number to round down
+@param m in: alignment, must be a power of two
+@return n rounded down to the biggest possible integer multiple of m */
+#define ut_calc_align_down(n, m) ut_2pow_round(n, m)
+/********************************************************//**
+Calculates the smallest multiple of m that is not smaller than n
+when m is a power of two. In other words, rounds n up to m * k.
+@param n in: number to round up
+@param m in: alignment, must be a power of two
+@return n rounded up to the smallest possible integer multiple of m */
+#define ut_calc_align(n, m) (((n) + ((m) - 1)) & ~((m) - 1))
+/*************************************************************//**
+Calculates fast the 2-logarithm of a number, rounded upward to an
+integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ ulint n); /*!< in: number */
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ ulint n); /*!< in: number */
+/*************************************************************//**
+Calculates fast the number rounded up to the nearest power of 2.
+@return first power of 2 which is >= n */
+UNIV_INTERN
+ulint
+ut_2_power_up(
+/*==========*/
+ ulint n) /*!< in: number != 0 */
+ __attribute__((const));
+
+/** Determine how many bytes (groups of 8 bits) are needed to
+store the given number of bits.
+@param b in: bits
+@return number of bytes (octets) needed to represent b */
+#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8)
+
+/**********************************************************//**
+Returns system time. We do not specify the format of the time returned:
+the only way to manipulate it is to use the function ut_difftime.
+@return system time */
+UNIV_INTERN
+ib_time_t
+ut_time(void);
+/*=========*/
+/**********************************************************//**
+Returns system time.
+Upon successful completion, the value 0 is returned; otherwise the
+value -1 is returned and the global variable errno is set to indicate the
+error.
+@return 0 on success, -1 otherwise */
+UNIV_INTERN
+int
+ut_usectime(
+/*========*/
+ ulint* sec, /*!< out: seconds since the Epoch */
+ ulint* ms); /*!< out: microseconds since the Epoch+*sec */
+
+/**********************************************************//**
+Returns the number of microseconds since epoch. Similar to
+time(3), the return value is also stored in *tloc, provided
+that tloc is non-NULL.
+@return us since epoch */
+UNIV_INTERN
+ullint
+ut_time_us(
+/*=======*/
+ ullint* tloc); /*!< out: us since epoch, if non-NULL */
+
+/**********************************************************//**
+Returns the difference of two times in seconds.
+@return time2 - time1 expressed in seconds */
+UNIV_INTERN
+double
+ut_difftime(
+/*========*/
+ ib_time_t time2, /*!< in: time */
+ ib_time_t time1); /*!< in: time */
+/**********************************************************//**
+Prints a timestamp to a file. */
+UNIV_INTERN
+void
+ut_print_timestamp(
+/*===============*/
+ FILE* file); /*!< in: file where to print */
+/**********************************************************//**
+Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp(
+/*=================*/
+ char* buf); /*!< in: buffer where to sprintf */
+#ifdef UNIV_HOTBACKUP
+/**********************************************************//**
+Sprintfs a timestamp to a buffer with no spaces and with ':' characters
+replaced by '_'. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp_without_extra_chars(
+/*=====================================*/
+ char* buf); /*!< in: buffer where to sprintf */
+/**********************************************************//**
+Returns current year, month, day. */
+UNIV_INTERN
+void
+ut_get_year_month_day(
+/*==================*/
+ ulint* year, /*!< out: current year */
+ ulint* month, /*!< out: month */
+ ulint* day); /*!< out: day */
+#else /* UNIV_HOTBACKUP */
+/*************************************************************//**
+Runs an idle loop on CPU. The argument gives the desired delay
+in microseconds on 100 MHz Pentium + Visual C++.
+@return dummy value */
+UNIV_INTERN
+ulint
+ut_delay(
+/*=====*/
+ ulint delay); /*!< in: delay in microseconds on 100 MHz Pentium */
+#endif /* UNIV_HOTBACKUP */
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+UNIV_INTERN
+void
+ut_print_buf(
+/*=========*/
+ FILE* file, /*!< in: file where to print */
+ const void* buf, /*!< in: memory buffer */
+ ulint len); /*!< in: length of the buffer */
+
+/**********************************************************************//**
+Outputs a NUL-terminated file name, quoted with apostrophes. */
+UNIV_INTERN
+void
+ut_print_filename(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const char* name); /*!< in: name to print */
+
+#ifndef UNIV_HOTBACKUP
+/* Forward declaration of transaction handle */
+struct trx_struct;
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_name(
+/*==========*/
+ FILE* f, /*!< in: output stream */
+ struct trx_struct*trx, /*!< in: transaction */
+ ibool table_id,/*!< in: TRUE=print a table name,
+ FALSE=print other identifier */
+ const char* name); /*!< in: name to print */
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_namel(
+/*===========*/
+ FILE* f, /*!< in: output stream */
+ struct trx_struct*trx, /*!< in: transaction (NULL=no quotes) */
+ ibool table_id,/*!< in: TRUE=print a table name,
+ FALSE=print other identifier */
+ const char* name, /*!< in: name to print */
+ ulint namelen);/*!< in: length of name */
+
+/**********************************************************************//**
+Catenate files. */
+UNIV_INTERN
+void
+ut_copy_file(
+/*=========*/
+ FILE* dest, /*!< in: output file */
+ FILE* src); /*!< in: input file to be appended to output */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef __WIN__
+/**********************************************************************//**
+A substitute for snprintf(3), formatted output conversion into
+a limited buffer.
+@return number of characters that would have been printed if the size
+were unlimited, not including the terminating '\0'. */
+UNIV_INTERN
+int
+ut_snprintf(
+/*========*/
+ char* str, /*!< out: string */
+ size_t size, /*!< in: str size */
+ const char* fmt, /*!< in: format */
+ ...); /*!< in: format values */
+#else
+/**********************************************************************//**
+A wrapper for snprintf(3), formatted output conversion into
+a limited buffer. */
+# define ut_snprintf snprintf
+#endif /* __WIN__ */
+
+#ifndef UNIV_NONINL
+#include "ut0ut.ic"
+#endif
+
+#endif
+
diff --git a/storage/innobase/include/ut0ut.ic b/storage/innobase/include/ut0ut.ic
new file mode 100644
index 00000000000..6f55c7e410e
--- /dev/null
+++ b/storage/innobase/include/ut0ut.ic
@@ -0,0 +1,162 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0ut.ic
+Various utilities
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/******************************************************//**
+Calculates the minimum of two ulints.
+@return minimum */
+UNIV_INLINE
+ulint
+ut_min(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2) /*!< in: second number */
+{
+ return((n1 <= n2) ? n1 : n2);
+}
+
+/******************************************************//**
+Calculates the maximum of two ulints.
+@return maximum */
+UNIV_INLINE
+ulint
+ut_max(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2) /*!< in: second number */
+{
+ return((n1 <= n2) ? n2 : n1);
+}
+
+/****************************************************************//**
+Calculates minimum of two ulint-pairs. */
+UNIV_INLINE
+void
+ut_pair_min(
+/*========*/
+ ulint* a, /*!< out: more significant part of minimum */
+ ulint* b, /*!< out: less significant part of minimum */
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint b1, /*!< in: less significant part of first pair */
+ ulint a2, /*!< in: more significant part of second pair */
+ ulint b2) /*!< in: less significant part of second pair */
+{
+ if (a1 == a2) {
+ *a = a1;
+ *b = ut_min(b1, b2);
+ } else if (a1 < a2) {
+ *a = a1;
+ *b = b1;
+ } else {
+ *a = a2;
+ *b = b2;
+ }
+}
+
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ ulint a, /*!< in: ulint */
+ ulint b) /*!< in: ulint */
+{
+ if (a < b) {
+ return(-1);
+ } else if (a == b) {
+ return(0);
+ } else {
+ return(1);
+ }
+}
+
+/*******************************************************//**
+Compares two pairs of ulints.
+@return -1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_pair_cmp(
+/*========*/
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint a2, /*!< in: less significant part of first pair */
+ ulint b1, /*!< in: more significant part of second pair */
+ ulint b2) /*!< in: less significant part of second pair */
+{
+ if (a1 > b1) {
+ return(1);
+ } else if (a1 < b1) {
+ return(-1);
+ } else if (a2 > b2) {
+ return(1);
+ } else if (a2 < b2) {
+ return(-1);
+ } else {
+ return(0);
+ }
+}
+
+/*************************************************************//**
+Calculates fast the 2-logarithm of a number, rounded upward to an
+integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ ulint n) /*!< in: number != 0 */
+{
+ ulint res;
+
+ res = 0;
+
+ ut_ad(n > 0);
+
+ n = n - 1;
+
+ for (;;) {
+ n = n / 2;
+
+ if (n == 0) {
+ break;
+ }
+
+ res++;
+ }
+
+ return(res + 1);
+}
+
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ ulint n) /*!< in: number */
+{
+ return((ulint) 1 << n);
+}
diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h
new file mode 100644
index 00000000000..a770f671cfc
--- /dev/null
+++ b/storage/innobase/include/ut0vec.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.h
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#ifndef IB_VECTOR_H
+#define IB_VECTOR_H
+
+#include "univ.i"
+#include "mem0mem.h"
+
+/** An automatically resizing vector data type. */
+typedef struct ib_vector_struct ib_vector_t;
+
+/* An automatically resizing vector datatype with the following properties:
+
+ -Contains void* items.
+
+ -The items are owned by the caller.
+
+ -All memory allocation is done through a heap owned by the caller, who is
+ responsible for freeing it when done with the vector.
+
+ -When the vector is resized, the old memory area is left allocated since it
+ uses the same heap as the new memory area, so this is best used for
+ relatively small or short-lived uses.
+*/
+
+/****************************************************************//**
+Create a new vector with the given initial size.
+@return vector */
+UNIV_INTERN
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+ mem_heap_t* heap, /*!< in: heap */
+ ulint size); /*!< in: initial size */
+
+/****************************************************************//**
+Push a new element to the vector, increasing its size if necessary. */
+UNIV_INTERN
+void
+ib_vector_push(
+/*===========*/
+ ib_vector_t* vec, /*!< in: vector */
+ void* elem); /*!< in: data element */
+
+/****************************************************************//**
+Get the number of elements in the vector.
+@return number of elements in vector */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+ const ib_vector_t* vec); /*!< in: vector */
+
+/****************************************************************//**
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+ const ib_vector_t* vec); /*!< in: vector */
+
+/****************************************************************//**
+Get the n'th element.
+@return n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+ ib_vector_t* vec, /*!< in: vector */
+ ulint n); /*!< in: element index to get */
+
+/****************************************************************//**
+Remove the last element from the vector. */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+ ib_vector_t* vec); /*!< in: vector */
+
+/****************************************************************//**
+Free the underlying heap of the vector. Note that vec is invalid
+after this call. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+ ib_vector_t* vec); /*!< in,own: vector */
+
+/** An automatically resizing vector data type. */
+struct ib_vector_struct {
+ mem_heap_t* heap; /*!< heap */
+ void** data; /*!< data elements */
+ ulint used; /*!< number of elements currently used */
+ ulint total; /*!< number of elements allocated */
+};
+
+#ifndef UNIV_NONINL
+#include "ut0vec.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0vec.ic b/storage/innobase/include/ut0vec.ic
new file mode 100644
index 00000000000..02e881f9bca
--- /dev/null
+++ b/storage/innobase/include/ut0vec.ic
@@ -0,0 +1,96 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.ic
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Get number of elements in vector.
+@return number of elements in vector */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+ const ib_vector_t* vec) /*!< in: vector */
+{
+ return(vec->used);
+}
+
+/****************************************************************//**
+Get n'th element.
+@return n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+ ib_vector_t* vec, /*!< in: vector */
+ ulint n) /*!< in: element index to get */
+{
+ ut_a(n < vec->used);
+
+ return(vec->data[n]);
+}
+
+/****************************************************************//**
+Remove the last element from the vector.
+@return last vector element */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+ ib_vector_t* vec) /*!< in/out: vector */
+{
+ void* elem;
+
+ ut_a(vec->used > 0);
+ --vec->used;
+ elem = vec->data[vec->used];
+
+ ut_d(vec->data[vec->used] = NULL);
+ UNIV_MEM_INVALID(&vec->data[vec->used], sizeof(*vec->data));
+
+ return(elem);
+}
+
+/****************************************************************//**
+Free the underlying heap of the vector. Note that vec is invalid
+after this call. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+ ib_vector_t* vec) /*!< in, own: vector */
+{
+ mem_heap_free(vec->heap);
+}
+
+/****************************************************************//**
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+ const ib_vector_t* vec) /*!< in: vector */
+{
+ return(ib_vector_size(vec) == 0);
+}
diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h
new file mode 100644
index 00000000000..2ec0f16ab05
--- /dev/null
+++ b/storage/innobase/include/ut0wqueue.h
@@ -0,0 +1,85 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0wqueue.h
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A Work queue. Threads can add work items to the queue and other threads can
+wait for work items to be available and take them off the queue for
+processing.
+************************************************************************/
+
+#ifndef IB_WORK_QUEUE_H
+#define IB_WORK_QUEUE_H
+
+#include "ut0list.h"
+#include "mem0mem.h"
+#include "os0sync.h"
+#include "sync0types.h"
+
+typedef struct ib_wqueue_struct ib_wqueue_t;
+
+/****************************************************************//**
+Create a new work queue.
+@return work queue */
+UNIV_INTERN
+ib_wqueue_t*
+ib_wqueue_create(void);
+/*===================*/
+
+/****************************************************************//**
+Free a work queue. */
+UNIV_INTERN
+void
+ib_wqueue_free(
+/*===========*/
+ ib_wqueue_t* wq); /*!< in: work queue */
+
+/****************************************************************//**
+Add a work item to the queue. */
+UNIV_INTERN
+void
+ib_wqueue_add(
+/*==========*/
+ ib_wqueue_t* wq, /*!< in: work queue */
+ void* item, /*!< in: work item */
+ mem_heap_t* heap); /*!< in: memory heap to use for allocating the
+ list node */
+
+/****************************************************************//**
+Wait for a work item to appear in the queue.
+@return work item */
+UNIV_INTERN
+void*
+ib_wqueue_wait(
+/*===========*/
+ ib_wqueue_t* wq); /*!< in: work queue */
+
+/* Work queue. */
+struct ib_wqueue_struct {
+ mutex_t mutex; /*!< mutex protecting everything */
+ ib_list_t* items; /*!< work item list */
+ os_event_t event; /*!< event we use to signal additions to list */
+};
+
+#endif