path: root/innobase/include
author     unknown <monty@donna.mysql.com>  2001-02-17 14:19:19 +0200
committer  unknown <monty@donna.mysql.com>  2001-02-17 14:19:19 +0200
commit     2662b59306ef0cd495fa6e2edf7129e58a11393a (patch)
tree       bfe39951a73e906579ab819bf5198ad8f3a64a36 /innobase/include
parent     66de55a56bdcf2f7a9c0c4f8e19b3e761475e202 (diff)
download   mariadb-git-2662b59306ef0cd495fa6e2edf7129e58a11393a.tar.gz
Added Innobase to source distribution
Docs/manual.texi:            Added Innobase documentation
configure.in:                Incremented version
include/my_base.h:           Added option for Innobase
myisam/mi_check.c:           cleanup
mysql-test/t/bdb.test:       cleanup
mysql-test/t/innobase.test:  Extended with new tests from bdb.test
mysql-test/t/merge.test:     Added test of SHOW CREATE
mysys/my_init.c:             Fix for UNIXWARE 7
scripts/mysql_install_db.sh: Always write how to start mysqld
scripts/safe_mysqld.sh:      Fixed typo
sql/ha_innobase.cc:          Update to new version
sql/ha_innobase.h:           Update to new version
sql/handler.h:               Added 'update_table_comment()' and 'append_create_info()'
sql/sql_delete.cc:           Fixes for Innobase
sql/sql_select.cc:           Fixes for Innobase
sql/sql_show.cc:             Append create information (for MERGE tables)
sql/sql_update.cc:           Fixes for Innobase
Diffstat (limited to 'innobase/include')
-rw-r--r--  innobase/include/Makefile.i  5
-rw-r--r--  innobase/include/btr0btr.h  391
-rw-r--r--  innobase/include/btr0btr.ic  223
-rw-r--r--  innobase/include/btr0cur.h  519
-rw-r--r--  innobase/include/btr0cur.ic  172
-rw-r--r--  innobase/include/btr0pcur.h  486
-rw-r--r--  innobase/include/btr0pcur.ic  598
-rw-r--r--  innobase/include/btr0sea.h  269
-rw-r--r--  innobase/include/btr0sea.ic  65
-rw-r--r--  innobase/include/btr0types.h  21
-rw-r--r--  innobase/include/buf0buf.h  834
-rw-r--r--  innobase/include/buf0buf.ic  641
-rw-r--r--  innobase/include/buf0flu.h  110
-rw-r--r--  innobase/include/buf0flu.ic  100
-rw-r--r--  innobase/include/buf0lru.h  117
-rw-r--r--  innobase/include/buf0lru.ic  8
-rw-r--r--  innobase/include/buf0rea.h  98
-rw-r--r--  innobase/include/buf0types.h  20
-rw-r--r--  innobase/include/com0com.h  125
-rw-r--r--  innobase/include/com0com.ic  7
-rw-r--r--  innobase/include/com0shm.h  103
-rw-r--r--  innobase/include/com0shm.ic  7
-rw-r--r--  innobase/include/data0data.h  430
-rw-r--r--  innobase/include/data0data.ic  491
-rw-r--r--  innobase/include/data0type.h  214
-rw-r--r--  innobase/include/data0type.ic  248
-rw-r--r--  innobase/include/data0types.h  19
-rw-r--r--  innobase/include/db0err.h  44
-rw-r--r--  innobase/include/dict0boot.h  132
-rw-r--r--  innobase/include/dict0boot.ic  124
-rw-r--r--  innobase/include/dict0crea.h  140
-rw-r--r--  innobase/include/dict0crea.ic  8
-rw-r--r--  innobase/include/dict0dict.h  677
-rw-r--r--  innobase/include/dict0dict.ic  696
-rw-r--r--  innobase/include/dict0load.h  49
-rw-r--r--  innobase/include/dict0load.ic  9
-rw-r--r--  innobase/include/dict0mem.h  335
-rw-r--r--  innobase/include/dict0mem.ic  9
-rw-r--r--  innobase/include/dict0types.h  28
-rw-r--r--  innobase/include/dyn0dyn.h  172
-rw-r--r--  innobase/include/dyn0dyn.ic  345
-rw-r--r--  innobase/include/eval0eval.h  97
-rw-r--r--  innobase/include/eval0eval.ic  236
-rw-r--r--  innobase/include/eval0proc.h  79
-rw-r--r--  innobase/include/eval0proc.ic  71
-rw-r--r--  innobase/include/fil0fil.h  357
-rw-r--r--  innobase/include/fsp0fsp.h  331
-rw-r--r--  innobase/include/fsp0fsp.ic  24
-rw-r--r--  innobase/include/fut0fut.h  36
-rw-r--r--  innobase/include/fut0fut.ic  36
-rw-r--r--  innobase/include/fut0lst.h  198
-rw-r--r--  innobase/include/fut0lst.ic  147
-rw-r--r--  innobase/include/ha0ha.h  137
-rw-r--r--  innobase/include/ha0ha.ic  280
-rw-r--r--  innobase/include/hash0hash.h  345
-rw-r--r--  innobase/include/hash0hash.ic  131
-rw-r--r--  innobase/include/ib_odbc.h  149
-rw-r--r--  innobase/include/ibuf0ibuf.h  268
-rw-r--r--  innobase/include/ibuf0ibuf.ic  226
-rw-r--r--  innobase/include/ibuf0types.h  15
-rw-r--r--  innobase/include/lock0lock.h  538
-rw-r--r--  innobase/include/lock0lock.ic  80
-rw-r--r--  innobase/include/lock0types.h  15
-rw-r--r--  innobase/include/log0log.h  752
-rw-r--r--  innobase/include/log0log.ic  378
-rw-r--r--  innobase/include/log0recv.h  284
-rw-r--r--  innobase/include/log0recv.ic  35
-rw-r--r--  innobase/include/mach0data.h  332
-rw-r--r--  innobase/include/mach0data.ic  727
-rw-r--r--  innobase/include/makefilewin.i  34
-rw-r--r--  innobase/include/mem0dbg.h  117
-rw-r--r--  innobase/include/mem0dbg.ic  91
-rw-r--r--  innobase/include/mem0mem.h  350
-rw-r--r--  innobase/include/mem0mem.ic  597
-rw-r--r--  innobase/include/mem0pool.h  83
-rw-r--r--  innobase/include/mem0pool.ic  7
-rw-r--r--  innobase/include/mtr0log.h  178
-rw-r--r--  innobase/include/mtr0log.ic  187
-rw-r--r--  innobase/include/mtr0mtr.h  343
-rw-r--r--  innobase/include/mtr0mtr.ic  261
-rw-r--r--  innobase/include/mtr0types.h  14
-rw-r--r--  innobase/include/odbc0odbc.h  20
-rw-r--r--  innobase/include/os0file.h  353
-rw-r--r--  innobase/include/os0proc.h  71
-rw-r--r--  innobase/include/os0proc.ic  10
-rw-r--r--  innobase/include/os0shm.h  66
-rw-r--r--  innobase/include/os0shm.ic  10
-rw-r--r--  innobase/include/os0sync.h  198
-rw-r--r--  innobase/include/os0sync.ic  56
-rw-r--r--  innobase/include/os0thread.h  121
-rw-r--r--  innobase/include/os0thread.ic  8
-rw-r--r--  innobase/include/page0cur.h  263
-rw-r--r--  innobase/include/page0cur.ic  221
-rw-r--r--  innobase/include/page0page.h  697
-rw-r--r--  innobase/include/page0page.ic  772
-rw-r--r--  innobase/include/page0types.h  20
-rw-r--r--  innobase/include/pars0grm.h  90
-rw-r--r--  innobase/include/pars0opt.h  58
-rw-r--r--  innobase/include/pars0opt.ic  7
-rw-r--r--  innobase/include/pars0pars.h  566
-rw-r--r--  innobase/include/pars0pars.ic  7
-rw-r--r--  innobase/include/pars0sym.h  191
-rw-r--r--  innobase/include/pars0sym.ic  7
-rw-r--r--  innobase/include/pars0types.h  29
-rw-r--r--  innobase/include/que0que.h  495
-rw-r--r--  innobase/include/que0que.ic  304
-rw-r--r--  innobase/include/que0types.h  42
-rw-r--r--  innobase/include/read0read.h  92
-rw-r--r--  innobase/include/read0read.ic  85
-rw-r--r--  innobase/include/read0types.h  14
-rw-r--r--  innobase/include/rem0cmp.h  130
-rw-r--r--  innobase/include/rem0cmp.ic  84
-rw-r--r--  innobase/include/rem0rec.h  357
-rw-r--r--  innobase/include/rem0rec.ic  959
-rw-r--r--  innobase/include/rem0types.h  16
-rw-r--r--  innobase/include/row0ins.h  142
-rw-r--r--  innobase/include/row0ins.ic  9
-rw-r--r--  innobase/include/row0mysql.h  359
-rw-r--r--  innobase/include/row0mysql.ic  97
-rw-r--r--  innobase/include/row0purge.h  80
-rw-r--r--  innobase/include/row0purge.ic  8
-rw-r--r--  innobase/include/row0row.h  266
-rw-r--r--  innobase/include/row0row.ic  165
-rw-r--r--  innobase/include/row0sel.h  330
-rw-r--r--  innobase/include/row0sel.ic  91
-rw-r--r--  innobase/include/row0types.h  37
-rw-r--r--  innobase/include/row0uins.h  37
-rw-r--r--  innobase/include/row0uins.ic  8
-rw-r--r--  innobase/include/row0umod.h  35
-rw-r--r--  innobase/include/row0umod.ic  7
-rw-r--r--  innobase/include/row0undo.h  117
-rw-r--r--  innobase/include/row0undo.ic  7
-rw-r--r--  innobase/include/row0upd.h  363
-rw-r--r--  innobase/include/row0upd.ic  105
-rw-r--r--  innobase/include/row0vers.h  95
-rw-r--r--  innobase/include/row0vers.ic  83
-rw-r--r--  innobase/include/srv0que.h  53
-rw-r--r--  innobase/include/srv0srv.h  237
-rw-r--r--  innobase/include/srv0srv.ic  7
-rw-r--r--  innobase/include/srv0start.h  31
-rw-r--r--  innobase/include/sync0arr.h  114
-rw-r--r--  innobase/include/sync0arr.ic  10
-rw-r--r--  innobase/include/sync0ipm.h  113
-rw-r--r--  innobase/include/sync0ipm.ic  182
-rw-r--r--  innobase/include/sync0rw.h  493
-rw-r--r--  innobase/include/sync0rw.ic  510
-rw-r--r--  innobase/include/sync0sync.h  497
-rw-r--r--  innobase/include/sync0sync.ic  226
-rw-r--r--  innobase/include/sync0types.h  15
-rw-r--r--  innobase/include/thr0loc.h  67
-rw-r--r--  innobase/include/thr0loc.ic  7
-rw-r--r--  innobase/include/trx0purge.h  166
-rw-r--r--  innobase/include/trx0purge.ic  26
-rw-r--r--  innobase/include/trx0rec.h  284
-rw-r--r--  innobase/include/trx0rec.ic  69
-rw-r--r--  innobase/include/trx0roll.h  216
-rw-r--r--  innobase/include/trx0roll.ic  23
-rw-r--r--  innobase/include/trx0rseg.h  193
-rw-r--r--  innobase/include/trx0rseg.ic  112
-rw-r--r--  innobase/include/trx0sys.h  270
-rw-r--r--  innobase/include/trx0sys.ic  352
-rw-r--r--  innobase/include/trx0trx.h  412
-rw-r--r--  innobase/include/trx0trx.ic  23
-rw-r--r--  innobase/include/trx0types.h  43
-rw-r--r--  innobase/include/trx0undo.h  473
-rw-r--r--  innobase/include/trx0undo.ic  319
-rw-r--r--  innobase/include/univ.i  166
-rw-r--r--  innobase/include/univold.i  164
-rw-r--r--  innobase/include/univoldmysql.i  181
-rw-r--r--  innobase/include/usr0sess.h  318
-rw-r--r--  innobase/include/usr0sess.ic  31
-rw-r--r--  innobase/include/usr0types.h  16
-rw-r--r--  innobase/include/ut0byte.h  229
-rw-r--r--  innobase/include/ut0byte.ic  360
-rw-r--r--  innobase/include/ut0dbg.h  78
-rw-r--r--  innobase/include/ut0lst.h  215
-rw-r--r--  innobase/include/ut0mem.h  64
-rw-r--r--  innobase/include/ut0mem.ic  57
-rw-r--r--  innobase/include/ut0rnd.h  121
-rw-r--r--  innobase/include/ut0rnd.ic  222
-rw-r--r--  innobase/include/ut0sort.h  91
-rw-r--r--  innobase/include/ut0ut.h  174
-rw-r--r--  innobase/include/ut0ut.ic  196
183 files changed, 34741 insertions, 0 deletions
diff --git a/innobase/include/Makefile.i b/innobase/include/Makefile.i
new file mode 100644
index 00000000000..2bc51147347
--- /dev/null
+++ b/innobase/include/Makefile.i
@@ -0,0 +1,5 @@
+# Makefile included in Makefile.am in every subdirectory
+
+libsdir = ../libs
+
+INCLUDES = -I../../include -I../include
diff --git a/innobase/include/btr0btr.h b/innobase/include/btr0btr.h
new file mode 100644
index 00000000000..d2ac9952695
--- /dev/null
+++ b/innobase/include/btr0btr.h
@@ -0,0 +1,391 @@
+/******************************************************
+The B-tree
+
+(c) 1994-1996 Innobase Oy
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0btr_h
+#define btr0btr_h
+
+#include "univ.i"
+
+#include "dict0dict.h"
+#include "data0data.h"
+#include "page0cur.h"
+#include "rem0rec.h"
+#include "mtr0mtr.h"
+#include "btr0types.h"
+
+/* Maximum record size which can be stored on a page, without using the
+special big record storage structure */
+
+#define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200)
+
+/* Maximum key size in a B-tree: the records on non-leaf levels must be
+shorter than this */
+
+#define BTR_PAGE_MAX_KEY_SIZE 1024
+
+/* If data in page drops below this limit, we try to compress it.
+NOTE! The value has to be > 2 * BTR_PAGE_MAX_KEY_SIZE */
+
+#define BTR_COMPRESS_LIMIT (UNIV_PAGE_SIZE / 4 + 1)
+
+/* Latching modes for the search function (in btr0cur.*) */
+#define BTR_SEARCH_LEAF RW_S_LATCH
+#define BTR_MODIFY_LEAF RW_X_LATCH
+#define BTR_NO_LATCHES RW_NO_LATCH
+#define BTR_MODIFY_TREE 33
+#define BTR_CONT_MODIFY_TREE 34
+#define BTR_SEARCH_PREV 35
+#define BTR_MODIFY_PREV 36
+
+/* If this is ORed to the latch mode, it means that the search tuple will be
+inserted to the index, at the searched position */
+#define BTR_INSERT 512
+
+/* This flag ORed to latch mode says that we do the search in query
+optimization */
+#define BTR_ESTIMATE 1024
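/* Editorial note, not part of the patch: BTR_INSERT and BTR_ESTIMATE are
meant to be ORed into the latch mode that is passed down to the search
functions declared below and in btr0cur.h. For illustration only, a caller
searching for an insert position might pass

	BTR_MODIFY_LEAF | BTR_INSERT

while the range-estimation code could pass BTR_SEARCH_LEAF | BTR_ESTIMATE. */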
+/******************************************************************
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+ ulint space, /* in: space id */
+ ulint page_no, /* in: page number */
+ ulint mode, /* in: latch mode */
+ mtr_t* mtr); /* in: mtr */
+/******************************************************************
+Gets the index id field of a page. */
+UNIV_INLINE
+dulint
+btr_page_get_index_id(
+/*==================*/
+ /* out: index id */
+ page_t* page); /* in: index page */
+/************************************************************
+Gets the node level field in an index page. */
+UNIV_INLINE
+ulint
+btr_page_get_level_low(
+/*===================*/
+ /* out: level, leaf level == 0 */
+ page_t* page); /* in: index page */
+/************************************************************
+Gets the node level field in an index page. */
+UNIV_INLINE
+ulint
+btr_page_get_level(
+/*===============*/
+ /* out: level, leaf level == 0 */
+ page_t* page, /* in: index page */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************
+Gets the next index page number. */
+UNIV_INLINE
+ulint
+btr_page_get_next(
+/*==============*/
+ /* out: next page number */
+ page_t* page, /* in: index page */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************
+Gets the previous index page number. */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+ /* out: prev page number */
+ page_t* page, /* in: index page */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/*****************************************************************
+Gets pointer to the previous user record in the tree. It is assumed
+that the caller has appropriate latches on the page and its neighbor. */
+
+rec_t*
+btr_get_prev_user_rec(
+/*==================*/
+ /* out: previous user record, NULL if there is none */
+ rec_t* rec, /* in: record on leaf level */
+ mtr_t* mtr); /* in: mtr holding a latch on the page, and if
+ needed, also to the previous page */
+/*****************************************************************
+Gets pointer to the next user record in the tree. It is assumed
+that the caller has appropriate latches on the page and its neighbor. */
+
+rec_t*
+btr_get_next_user_rec(
+/*==================*/
+ /* out: next user record, NULL if there is none */
+ rec_t* rec, /* in: record on leaf level */
+ mtr_t* mtr); /* in: mtr holding a latch on the page, and if
+ needed, also to the next page */
+/******************************************************************
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+ page_t* page, /* in: page */
+ ulint latch_mode, /* in: BTR_SEARCH_LEAF or BTR_MODIFY_LEAF */
+ mtr_t* mtr); /* in: mtr */
+/******************************************************************
+Gets the child node file address in a node pointer. */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ /* out: child node address */
+ rec_t* rec); /* in: node pointer record */
+/****************************************************************
+Creates the root node for a new index tree. */
+
+ulint
+btr_create(
+/*=======*/
+ /* out: page number of the created root, FIL_NULL if
+ did not succeed */
+ ulint type, /* in: type of the index */
+ ulint space, /* in: space where created */
+ dulint index_id,/* in: index id */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/****************************************************************
+Frees a B-tree except the root page, which MUST be freed after this
+by calling btr_free_root. */
+
+void
+btr_free_but_not_root(
+/*==================*/
+ ulint space, /* in: space where created */
+ ulint root_page_no); /* in: root page number */
+/****************************************************************
+Frees the B-tree root page. The rest of the tree MUST already have been freed. */
+
+void
+btr_free_root(
+/*==========*/
+ ulint space, /* in: space where created */
+ ulint root_page_no, /* in: root page number */
+ mtr_t* mtr); /* in: a mini-transaction which has already
+ been started */
+/*****************************************************************
+Makes the tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called. */
+
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ /* out: inserted record */
+ btr_cur_t* cursor, /* in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ dtuple_t* tuple, /* in: tuple to insert */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Reorganizes an index page. */
+
+void
+btr_page_reorganize(
+/*================*/
+ page_t* page, /* in: page to be reorganized */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Reorganizes an index page. */
+
+void
+btr_page_reorganize_low(
+/*====================*/
+ ibool low, /* in: TRUE if locks should not be updated, i.e.,
+ there cannot exist locks on the page */
+ page_t* page, /* in: page to be reorganized */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Decides if the page should be split at the convergence point of
+inserts converging to left. */
+
+ibool
+btr_page_get_split_rec_to_left(
+/*===========================*/
+ /* out: TRUE if split recommended */
+ btr_cur_t* cursor, /* in: cursor at which to insert */
+ rec_t** split_rec);/* out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple should be first */
+/*****************************************************************
+Decides if the page should be split at the convergence point of
+inserts converging to right. */
+
+ibool
+btr_page_get_split_rec_to_right(
+/*============================*/
+ /* out: TRUE if split recommended */
+ btr_cur_t* cursor, /* in: cursor at which to insert */
+ rec_t** split_rec);/* out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple should be first */
+/*****************************************************************
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch
+is released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore
+enough free disk space must be guaranteed to be available before
+this function is called. */
+
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ /* out: inserted record; NOTE: the tree
+ x-latch is released! NOTE: 2 free disk
+ pages must be available! */
+ btr_cur_t* cursor, /* in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ dtuple_t* tuple, /* in: tuple to insert */
+ mtr_t* mtr); /* in: mtr */
+/***********************************************************
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+
+void
+btr_insert_on_non_leaf_level(
+/*=========================*/
+ dict_tree_t* tree, /* in: tree */
+ ulint level, /* in: level, must be > 0 */
+ dtuple_t* tuple, /* in: the record to be inserted */
+ mtr_t* mtr); /* in: mtr */
+/********************************************************************
+Sets a record as the predefined minimum record. */
+
+void
+btr_set_min_rec_mark(
+/*=================*/
+ rec_t* rec, /* in: record */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Deletes on the upper level the node pointer to a page. */
+
+void
+btr_node_ptr_delete(
+/*================*/
+ dict_tree_t* tree, /* in: index tree */
+ page_t* page, /* in: page whose node pointer is deleted */
+ mtr_t* mtr); /* in: mtr */
+/****************************************************************
+Checks that the node pointer to a page is appropriate. */
+
+ibool
+btr_check_node_ptr(
+/*===============*/
+ /* out: TRUE */
+ dict_tree_t* tree, /* in: index tree */
+ page_t* page, /* in: index page */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the
+brother reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level, lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to
+the brothers, if they exist. NOTE: it is assumed that the caller has reserved
+enough free extents so that the compression will always succeed if done! */
+void
+btr_compress(
+/*=========*/
+ btr_cur_t* cursor, /* in: cursor on the page to merge or lift;
+ the page must not be empty: in record delete
+ use btr_discard_page if the page would become
+ empty */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+
+void
+btr_discard_page(
+/*=============*/
+ btr_cur_t* cursor, /* in: cursor on the page to discard: not on
+ the root page */
+ mtr_t* mtr); /* in: mtr */
+/************************************************************************
+Declares the latching order level for the page latch in the debug version. */
+UNIV_INLINE
+void
+btr_declare_page_latch(
+/*===================*/
+ page_t* page, /* in: page */
+ ibool leaf); /* in: TRUE if a leaf */
+/********************************************************************
+Parses the redo log record for setting an index record as the predefined
+minimum record. */
+
+byte*
+btr_parse_set_min_rec_mark(
+/*=======================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+/***************************************************************
+Parses a redo log record of reorganizing a page. */
+
+byte*
+btr_parse_page_reorganize(
+/*======================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+/******************************************************************
+Gets the number of pages in a B-tree. */
+
+ulint
+btr_get_size(
+/*=========*/
+ /* out: number of pages */
+ dict_index_t* index, /* in: index */
+ ulint flag); /* in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+/*****************************************************************
+Prints size info of a B-tree. */
+
+void
+btr_print_size(
+/*===========*/
+ dict_tree_t* tree); /* in: index tree */
+/******************************************************************
+Prints directories and other info of all nodes in the tree. */
+
+void
+btr_print_tree(
+/*===========*/
+ dict_tree_t* tree, /* in: tree */
+ ulint width); /* in: print this many entries from start
+ and end */
+/******************************************************************
+Checks the consistency of an index tree. */
+
+void
+btr_validate_tree(
+/*==============*/
+ dict_tree_t* tree); /* in: tree */
+
+#define BTR_N_LEAF_PAGES 1
+#define BTR_TOTAL_SIZE 2
+
+#ifndef UNIV_NONINL
+#include "btr0btr.ic"
+#endif
+
+#endif
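For orientation, a minimal caller-side sketch of the page accessors declared
above. This is editorial, not part of the patch: it assumes the surrounding
InnoDB environment (univ.i, a running buffer pool, a valid space id and page
number), and the helper name btr_page_level_sketch is made up for
illustration; mtr_start and mtr_commit are declared in mtr0mtr.h.

	#include "btr0btr.h"
	#include "mtr0mtr.h"

	static ulint
	btr_page_level_sketch(ulint space, ulint page_no)
	{
		mtr_t	mtr;
		page_t*	page;
		ulint	level;

		mtr_start(&mtr);

		/* Fetch the page s-latched: BTR_SEARCH_LEAF maps to
		RW_S_LATCH */
		page = btr_page_get(space, page_no, BTR_SEARCH_LEAF, &mtr);

		level = btr_page_get_level(page, &mtr);	/* leaf level == 0 */

		/* Committing the mtr releases the page latch and the
		buffer fix */
		mtr_commit(&mtr);

		return(level);
	}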
diff --git a/innobase/include/btr0btr.ic b/innobase/include/btr0btr.ic
new file mode 100644
index 00000000000..5c1c89e9840
--- /dev/null
+++ b/innobase/include/btr0btr.ic
@@ -0,0 +1,223 @@
+/******************************************************
+The B-tree
+
+(c) 1994-1996 Innobase Oy
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+
+#define BTR_MAX_NODE_LEVEL 50 /* used in debug checking */
+
+/******************************************************************
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+ ulint space, /* in: space id */
+ ulint page_no, /* in: page number */
+ ulint mode, /* in: latch mode */
+ mtr_t* mtr) /* in: mtr */
+{
+ page_t* page;
+
+ page = buf_page_get(space, page_no, mode, mtr);
+#ifdef UNIV_SYNC_DEBUG
+ if (mode != RW_NO_LATCH) {
+
+ buf_page_dbg_add_level(page, SYNC_TREE_NODE);
+ }
+#endif
+ return(page);
+}
+
+/******************************************************************
+Sets the index id field of a page. */
+UNIV_INLINE
+void
+btr_page_set_index_id(
+/*==================*/
+ page_t* page, /* in: page to be created */
+ dulint id, /* in: index id */
+ mtr_t* mtr) /* in: mtr */
+{
+ mlog_write_dulint(page + PAGE_HEADER + PAGE_INDEX_ID, id,
+ MLOG_8BYTES, mtr);
+}
+
+/******************************************************************
+Gets the index id field of a page. */
+UNIV_INLINE
+dulint
+btr_page_get_index_id(
+/*==================*/
+ /* out: index id */
+ page_t* page) /* in: index page */
+{
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID));
+}
+
+/************************************************************
+Gets the node level field in an index page. */
+UNIV_INLINE
+ulint
+btr_page_get_level_low(
+/*===================*/
+ /* out: level, leaf level == 0 */
+ page_t* page) /* in: index page */
+{
+ ulint level;
+
+ ut_ad(page);
+
+ level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL);
+
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+ return(level);
+}
+
+/************************************************************
+Gets the node level field in an index page. */
+UNIV_INLINE
+ulint
+btr_page_get_level(
+/*===============*/
+ /* out: level, leaf level == 0 */
+ page_t* page, /* in: index page */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ return(btr_page_get_level_low(page));
+}
+
+/************************************************************
+Sets the node level field in an index page. */
+UNIV_INLINE
+void
+btr_page_set_level(
+/*===============*/
+ page_t* page, /* in: index page */
+ ulint level, /* in: level, leaf level == 0 */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+ mlog_write_ulint(page + PAGE_HEADER + PAGE_LEVEL, level,
+ MLOG_2BYTES, mtr);
+}
+
+/************************************************************
+Gets the next index page number. */
+UNIV_INLINE
+ulint
+btr_page_get_next(
+/*==============*/
+ /* out: next page number */
+ page_t* page, /* in: index page */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
+ MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains(mtr, buf_block_align(page),
+ MTR_MEMO_PAGE_S_FIX));
+
+ return(mach_read_from_4(page + FIL_PAGE_NEXT));
+}
+
+/************************************************************
+Sets the next index page field. */
+UNIV_INLINE
+void
+btr_page_set_next(
+/*==============*/
+ page_t* page, /* in: index page */
+ ulint next, /* in: next page number */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ mlog_write_ulint(page + FIL_PAGE_NEXT, next, MLOG_4BYTES, mtr);
+}
+
+/************************************************************
+Gets the previous index page number. */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+ /* out: prev page number */
+ page_t* page, /* in: index page */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ return(mach_read_from_4(page + FIL_PAGE_PREV));
+}
+
+/************************************************************
+Sets the previous index page field. */
+UNIV_INLINE
+void
+btr_page_set_prev(
+/*==============*/
+ page_t* page, /* in: index page */
+ ulint prev, /* in: previous page number */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ mlog_write_ulint(page + FIL_PAGE_PREV, prev, MLOG_4BYTES, mtr);
+}
+
+/******************************************************************
+Gets the child node file address in a node pointer. */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ /* out: child node address */
+ rec_t* rec) /* in: node pointer record */
+{
+ ulint n_fields;
+ byte* field;
+ ulint len;
+
+ n_fields = rec_get_n_fields(rec);
+
+ /* The child address is in the last field */
+ field = rec_get_nth_field(rec, n_fields - 1, &len);
+
+ ut_ad(len == 4);
+
+ return(mach_read_from_4(field));
+}
+
+/******************************************************************
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+ page_t* page, /* in: page */
+ ulint latch_mode, /* in: BTR_SEARCH_LEAF or BTR_MODIFY_LEAF */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(!mtr_memo_contains(mtr, buf_block_align(page),
+ MTR_MEMO_MODIFY));
+ if (latch_mode == BTR_SEARCH_LEAF) {
+ mtr_memo_release(mtr, buf_block_align(page),
+ MTR_MEMO_PAGE_S_FIX);
+ } else {
+ ut_ad(latch_mode == BTR_MODIFY_LEAF);
+ mtr_memo_release(mtr, buf_block_align(page),
+ MTR_MEMO_PAGE_X_FIX);
+ }
+}
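The inline functions above read page header fields with mach_read_from_2 and
mach_read_from_4 (declared in mach0data.h, not shown in this section); these
fields are stored most-significant-byte first, independent of the host byte
order. A self-contained sketch of the equivalent 4-byte read, outside the
InnoDB environment:

	#include <stdio.h>

	typedef unsigned long int ulint;

	/* Equivalent of mach_read_from_4: big-endian, MSB first */
	static ulint
	read_from_4(const unsigned char* b)
	{
		return(((ulint) b[0] << 24) | ((ulint) b[1] << 16)
		       | ((ulint) b[2] << 8) | (ulint) b[3]);
	}

	int
	main(void)
	{
		/* A next-page field (cf. FIL_PAGE_NEXT) holding page
		number 7 */
		unsigned char field[4] = {0x00, 0x00, 0x00, 0x07};

		printf("%lu\n", read_from_4(field));	/* prints 7 */

		return(0);
	}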
diff --git a/innobase/include/btr0cur.h b/innobase/include/btr0cur.h
new file mode 100644
index 00000000000..79ec56c8e50
--- /dev/null
+++ b/innobase/include/btr0cur.h
@@ -0,0 +1,519 @@
+/******************************************************
+The index tree cursor
+
+(c) 1994-1996 Innobase Oy
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0cur_h
+#define btr0cur_h
+
+#include "univ.i"
+#include "dict0dict.h"
+#include "data0data.h"
+#include "page0cur.h"
+#include "btr0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "ha0ha.h"
+
+/* Mode flags for btr_cur operations; these can be ORed */
+#define BTR_NO_UNDO_LOG_FLAG 1 /* do no undo logging */
+#define BTR_NO_LOCKING_FLAG 2 /* do no record lock checking */
+#define BTR_KEEP_SYS_FLAG 4 /* sys fields will be found from the
+ update vector or inserted entry */
+
+#define BTR_CUR_ADAPT
+#define BTR_CUR_HASH_ADAPT
+
+/*************************************************************
+Returns the page cursor component of a tree cursor. */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+ /* out: pointer to page cursor component */
+ btr_cur_t* cursor); /* in: tree cursor */
+/*************************************************************
+Returns the record pointer of a tree cursor. */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+ /* out: pointer to record */
+ btr_cur_t* cursor); /* in: tree cursor */
+/*************************************************************
+Invalidates a tree cursor by setting record pointer to NULL. */
+UNIV_INLINE
+void
+btr_cur_invalidate(
+/*===============*/
+ btr_cur_t* cursor); /* in: tree cursor */
+/*************************************************************
+Returns the page of a tree cursor. */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ /* out: pointer to page */
+ btr_cur_t* cursor); /* in: tree cursor */
+/*************************************************************
+Returns the tree of a cursor. */
+UNIV_INLINE
+dict_tree_t*
+btr_cur_get_tree(
+/*=============*/
+ /* out: tree */
+ btr_cur_t* cursor); /* in: tree cursor */
+/*************************************************************
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in tree */
+ btr_cur_t* cursor);/* in: cursor */
+/************************************************************************
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value. */
+
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+ dict_index_t* index, /* in: index */
+ ulint level, /* in: the tree level of search */
+ dtuple_t* tuple, /* in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ ulint mode, /* in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be PAGE_CUR_LE,
+ not PAGE_CUR_GE, as the latter may end up on
+ the previous page of the record! Inserts
+ should always be made using PAGE_CUR_LE to
+ search the position! */
+ ulint latch_mode, /* in: BTR_SEARCH_LEAF, ...;
+ cursor->left_page is used to store a pointer
+ to the left neighbor page, in the cases
+ BTR_SEARCH_PREV and BTR_MODIFY_PREV */
+ btr_cur_t* cursor, /* out: tree cursor; the cursor page is s- or
+ x-latched */
+ ulint has_search_latch,/* in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ mtr_t* mtr); /* in: mtr */
+/*********************************************************************
+Opens a cursor at either end of an index. */
+
+void
+btr_cur_open_at_index_side(
+/*=======================*/
+ ibool from_left, /* in: TRUE if open to the low end,
+ FALSE if to the high end */
+ dict_index_t* index, /* in: index */
+ ulint latch_mode, /* in: latch mode */
+ btr_cur_t* cursor, /* in: cursor */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+Positions a cursor at a randomly chosen position within a B-tree. */
+
+void
+btr_cur_open_at_rnd_pos(
+/*====================*/
+ dict_index_t* index, /* in: index */
+ ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /* in/out: B-tree cursor */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record. */
+
+ulint
+btr_cur_optimistic_insert(
+/*======================*/
+ /* out: DB_SUCCESS, DB_WAIT_LOCK,
+ DB_FAIL, or error number */
+ ulint flags, /* in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /* in: cursor on page after which
+ to insert; cursor stays valid */
+ dtuple_t* entry, /* in: entry to insert */
+ rec_t** rec, /* out: pointer to inserted record if
+ succeed */
+ que_thr_t* thr, /* in: query thread or NULL */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist. */
+
+ulint
+btr_cur_pessimistic_insert(
+/*=======================*/
+ /* out: DB_SUCCESS or error number */
+ ulint flags, /* in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /* in: cursor after which to insert;
+ cursor does not stay valid */
+ dtuple_t* entry, /* in: entry to insert */
+ rec_t** rec, /* out: pointer to inserted record if
+ succeed */
+ que_thr_t* thr, /* in: query thread or NULL */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Updates a record when the update causes no size changes in its fields. */
+
+ulint
+btr_cur_update_in_place(
+/*====================*/
+ /* out: DB_SUCCESS or error number */
+ ulint flags, /* in: undo logging and locking flags */
+ btr_cur_t* cursor, /* in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ upd_t* update, /* in: update vector */
+ ulint cmpl_info,/* in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended. */
+
+ulint
+btr_cur_optimistic_update(
+/*======================*/
+ /* out: DB_SUCCESS, or DB_OVERFLOW if the
+ updated record does not fit, DB_UNDERFLOW
+ if the page would become too empty */
+ ulint flags, /* in: undo logging and locking flags */
+ btr_cur_t* cursor, /* in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ upd_t* update, /* in: update vector; this must also
+ contain trx id and roll ptr fields */
+ ulint cmpl_info,/* in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist. */
+
+ulint
+btr_cur_pessimistic_update(
+/*=======================*/
+ /* out: DB_SUCCESS or error code */
+ ulint flags, /* in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /* in: cursor on the record to update;
+ cursor does not stay valid */
+ upd_t* update, /* in: update vector; this is allowed to also
+ contain trx id and roll ptr fields, but
+ the values in update vector have no effect */
+ ulint cmpl_info,/* in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr); /* in: mtr */
+/***************************************************************
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created. */
+
+ulint
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT, or error
+ number */
+ ulint flags, /* in: undo logging and locking flags */
+ btr_cur_t* cursor, /* in: cursor */
+ ibool val, /* in: value to set */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr); /* in: mtr */
+/***************************************************************
+Sets a secondary index record delete mark to TRUE or FALSE. */
+
+ulint
+btr_cur_del_mark_set_sec_rec(
+/*=========================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT, or error
+ number */
+ ulint flags, /* in: locking flag */
+ btr_cur_t* cursor, /* in: cursor */
+ ibool val, /* in: value to set */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr); /* in: mtr */
+/***************************************************************
+Sets a secondary index record delete mark to FALSE. This function is
+only used by the insert buffer insert merge mechanism. */
+
+void
+btr_cur_del_unmark_for_ibuf(
+/*========================*/
+ rec_t* rec, /* in: record to delete unmark */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Tries to compress a page of the tree on the leaf level. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done! */
+
+void
+btr_cur_compress(
+/*=============*/
+ btr_cur_t* cursor, /* in: cursor on the page to compress;
+ cursor does not stay valid */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done! */
+
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+ /* out: TRUE if compression occurred */
+ btr_cur_t* cursor, /* in: cursor on the page to compress;
+ cursor does not stay valid if compression
+ occurs */
+ mtr_t* mtr); /* in: mtr */
+/***********************************************************
+Removes the record on which the tree cursor is positioned. It is assumed
+that the mtr has an x-latch on the page where the cursor is positioned,
+but no latch on the whole tree. */
+
+ibool
+btr_cur_optimistic_delete(
+/*======================*/
+ /* out: TRUE if success, i.e., the page
+ did not become too empty */
+ btr_cur_t* cursor, /* in: cursor on the record to delete;
+ cursor stays valid: if deletion succeeds,
+ on function exit it points to the successor
+ of the deleted record */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist. */
+
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+ /* out: TRUE if compression occurred */
+ ulint* err, /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+ the latter may occur because we may have
+ to update node pointers on upper levels,
+ and in the case of variable length keys
+ these may actually grow in size */
+ ibool has_reserved_extents, /* in: TRUE if the
+ caller has already reserved enough free
+ extents so that he knows that the operation
+ will succeed */
+ btr_cur_t* cursor, /* in: cursor on the record to delete;
+ if compression does not occur, the cursor
+ stays valid: it points to successor of
+ deleted record on function exit */
+ mtr_t* mtr); /* in: mtr */
+/***************************************************************
+Parses a redo log record of updating a record in-place. */
+
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page); /* in: page or NULL */
+/***************************************************************
+Parses a redo log record of updating a record, but not in-place. */
+
+byte*
+btr_cur_parse_opt_update(
+/*=====================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+/********************************************************************
+Parses the redo log record for delete marking or unmarking of a clustered
+index record. */
+
+byte*
+btr_cur_parse_del_mark_set_clust_rec(
+/*=================================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page); /* in: page or NULL */
+/********************************************************************
+Parses the redo log record for delete marking or unmarking of a secondary
+index record. */
+
+byte*
+btr_cur_parse_del_mark_set_sec_rec(
+/*===============================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page); /* in: page or NULL */
+/***********************************************************************
+Estimates the number of rows in a given index range. */
+
+ulint
+btr_estimate_n_rows_in_range(
+/*=========================*/
+ /* out: estimated number of rows */
+ dict_index_t* index, /* in: index */
+ dtuple_t* tuple1, /* in: range start, may also be empty tuple */
+ ulint mode1, /* in: search mode for range start */
+ dtuple_t* tuple2, /* in: range end, may also be empty tuple */
+ ulint mode2); /* in: search mode for range end */
+/***********************************************************************
+Estimates the number of different key values in a given index. */
+
+ulint
+btr_estimate_number_of_different_key_vals(
+/*======================================*/
+ /* out: estimated number of key values */
+ dict_index_t* index); /* in: index */
+
+
+/*######################################################################*/
+
+/* In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+
+#define BTR_CUR_PAGE_COMPRESS_LIMIT (UNIV_PAGE_SIZE / 2)
+
+/* A slot in the path array. We store here info on a search path down the
+tree. Each slot contains data on a single level of the tree. */
+
+typedef struct btr_path_struct btr_path_t;
+struct btr_path_struct{
+ ulint nth_rec; /* index of the record
+ where the page cursor stopped on
+ this level (index in alphabetical
+ order); value ULINT_UNDEFINED
+ denotes array end */
+ ulint n_recs; /* number of records on the page */
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS 250 /* size of path array (in slots) */
+
+/* The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+
+struct btr_cur_struct {
+ dict_index_t* index; /* index where positioned */
+ page_cur_t page_cur; /* page cursor */
+ page_t* left_page; /* this field is used to store a pointer
+ to the left neighbor page, in the cases
+ BTR_SEARCH_PREV and BTR_MODIFY_PREV */
+ /*------------------------------*/
+ que_thr_t* thr; /* this field is only used when
+ btr_cur_search_... is called for an
+ index entry insertion: the calling
+ query thread is passed here to be
+ used in the insert buffer */
+ /*------------------------------*/
+ /* The following fields are used in btr_cur_search... to pass
+ information: */
+ ulint flag; /* BTR_CUR_HASH, BTR_CUR_HASH_FAIL,
+ BTR_CUR_BINARY, or
+ BTR_CUR_INSERT_TO_IBUF */
+ ulint tree_height; /* Tree height if the search is done
+ for a pessimistic insert or update
+ operation */
+ ulint up_match; /* If the search mode was PAGE_CUR_LE,
+ the number of matched fields to
+ the first user record to the right of
+ the cursor record after
+ btr_cur_search_...;
+ for the mode PAGE_CUR_GE, the matched
+ fields to the first user record AT THE
+ CURSOR or to the right of it;
+ NOTE that the up_match and low_match
+ values may exceed the correct values
+ for comparison to the adjacent user
+ record if that record is on a
+ different leaf page! (See the note in
+ row_ins_duplicate_key.) */
+ ulint up_bytes; /* number of matched bytes to the
+ right at the time cursor positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint low_match; /* if search mode was PAGE_CUR_LE,
+ the number of matched fields to the
+ first user record AT THE CURSOR or
+ to the left of it after
+ btr_cur_search_...;
+ NOT defined for PAGE_CUR_GE or any
+ other search modes; see also the NOTE
+ in up_match! */
+ ulint low_bytes; /* number of matched bytes to the
+ right at the time cursor positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint n_fields; /* prefix length used in a hash
+ search if hash_node != NULL */
+ ulint n_bytes; /* hash prefix bytes if hash_node !=
+ NULL */
+ ulint fold; /* fold value used in the search if
+ flag is BTR_CUR_HASH */
+ /*------------------------------*/
+ btr_path_t* path_arr; /* in estimating the number of
+ rows in range, we store in this array
+ information of the path through
+ the tree */
+};
+
+/* Values for the flag documenting the used search method */
+#define BTR_CUR_HASH 1 /* successful shortcut using the hash
+ index */
+#define BTR_CUR_HASH_FAIL 2 /* failure using hash, success using
+ binary search: the misleading hash
+ reference is stored in the field
+ hash_node, and might be necessary to
+ update */
+#define BTR_CUR_BINARY 3 /* success using the binary search */
+#define BTR_CUR_INSERT_TO_IBUF 4 /* performed the intended insert to
+ the insert buffer */
+
+/* If pessimistic delete fails because of lack of file space,
+there is still a good chance of success a little later: try this many times,
+and sleep this many microseconds in between */
+#define BTR_CUR_RETRY_DELETE_N_TIMES 100
+#define BTR_CUR_RETRY_SLEEP_TIME 50000
+
+extern ulint btr_cur_n_non_sea;
+
+#ifndef UNIV_NONINL
+#include "btr0cur.ic"
+#endif
+
+#endif
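The retry constants above suggest a caller-side pattern that is not shown in
this header. A sketch, editorial only: delete_rec_once is a made-up
placeholder for code that positions a btr cursor and calls
btr_cur_pessimistic_delete, and os_thread_sleep is assumed to be the
microsecond sleep declared in os0thread.h.

	#include "btr0cur.h"
	#include "db0err.h"
	#include "os0thread.h"

	/* Placeholder, not part of the patch: positions a cursor, calls
	btr_cur_pessimistic_delete, and returns DB_SUCCESS or
	DB_OUT_OF_FILE_SPACE */
	extern ulint delete_rec_once(void);

	static ulint
	delete_with_retry_sketch(void)
	{
		ulint	err;
		ulint	n_tries = 0;

		for (;;) {
			err = delete_rec_once();

			if (err == DB_SUCCESS
			    || n_tries >= BTR_CUR_RETRY_DELETE_N_TIMES) {

				return(err);
			}

			/* Lack of file space may clear up a little later:
			sleep and retry */
			os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);

			n_tries++;
		}
	}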
diff --git a/innobase/include/btr0cur.ic b/innobase/include/btr0cur.ic
new file mode 100644
index 00000000000..a3a04b60c45
--- /dev/null
+++ b/innobase/include/btr0cur.ic
@@ -0,0 +1,172 @@
+/******************************************************
+The index tree cursor
+
+(c) 1994-1996 Innobase Oy
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0btr.h"
+
+/*************************************************************
+Returns the page cursor component of a tree cursor. */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+ /* out: pointer to page cursor component */
+ btr_cur_t* cursor) /* in: tree cursor */
+{
+ return(&(cursor->page_cur));
+}
+
+/*************************************************************
+Returns the record pointer of a tree cursor. */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+ /* out: pointer to record */
+ btr_cur_t* cursor) /* in: tree cursor */
+{
+ return(page_cur_get_rec(&(cursor->page_cur)));
+}
+
+/*************************************************************
+Invalidates a tree cursor by setting record pointer to NULL. */
+UNIV_INLINE
+void
+btr_cur_invalidate(
+/*===============*/
+ btr_cur_t* cursor) /* in: tree cursor */
+{
+ page_cur_invalidate(&(cursor->page_cur));
+}
+
+/*************************************************************
+Returns the page of a tree cursor. */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ /* out: pointer to page */
+ btr_cur_t* cursor) /* in: tree cursor */
+{
+ return(buf_frame_align(page_cur_get_rec(&(cursor->page_cur))));
+}
+
+/*************************************************************
+Returns the tree of a cursor. */
+UNIV_INLINE
+dict_tree_t*
+btr_cur_get_tree(
+/*=============*/
+ /* out: tree */
+ btr_cur_t* cursor) /* in: tree cursor */
+{
+ return((cursor->index)->tree);
+}
+
+/*************************************************************
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in tree */
+ btr_cur_t* cursor) /* in: cursor */
+{
+ page_cur_position(rec, btr_cur_get_page_cur(cursor));
+
+ cursor->index = index;
+}
+
+/*************************************************************************
+Checks if compressing an index page where a btr cursor is placed makes
+sense. */
+UNIV_INLINE
+ibool
+btr_cur_compress_recommendation(
+/*============================*/
+ /* out: TRUE if compression is recommended */
+ btr_cur_t* cursor, /* in: btr cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ page_t* page;
+
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(
+ btr_cur_get_page(cursor)),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ if ((page_get_data_size(page) < BTR_CUR_PAGE_COMPRESS_LIMIT)
+ || ((btr_page_get_next(page, mtr) == FIL_NULL)
+ && (btr_page_get_prev(page, mtr) == FIL_NULL))) {
+
+ /* The page fillfactor has dropped below a predefined
+ minimum value OR the level in the B-tree contains just
+ one page: we recommend compression if this is not the
+ root page. */
+
+ if (dict_tree_get_page((cursor->index)->tree)
+ == buf_frame_get_page_no(page)) {
+
+ /* It is the root page */
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************************
+Checks if the record on which the cursor is placed can be deleted without
+making tree compression necessary (or, recommended). */
+UNIV_INLINE
+ibool
+btr_cur_can_delete_without_compress(
+/*================================*/
+ /* out: TRUE if can be deleted without
+ recommended compression */
+ btr_cur_t* cursor, /* in: btr cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint rec_size;
+ page_t* page;
+
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(
+ btr_cur_get_page(cursor)),
+ MTR_MEMO_PAGE_X_FIX));
+
+ rec_size = rec_get_size(btr_cur_get_rec(cursor));
+
+ page = btr_cur_get_page(cursor);
+
+ if ((page_get_data_size(page) - rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT)
+ || ((btr_page_get_next(page, mtr) == FIL_NULL)
+ && (btr_page_get_prev(page, mtr) == FIL_NULL))
+ || (page_get_n_recs(page) < 2)) {
+
+ /* The page fillfactor will drop below a predefined
+ minimum value, OR the level in the B-tree contains just
+ one page, OR the page will become empty: we recommend
+ compression if this is not the root page. */
+
+ if (dict_tree_get_page((cursor->index)->tree)
+ == buf_frame_get_page_no(page)) {
+
+ /* It is the root page */
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
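The two recommendation functions above come down to threshold arithmetic on
the page data size. A standalone sketch of that arithmetic, assuming the
customary 16 kB UNIV_PAGE_SIZE (the real value comes from univ.i, not this
file), so BTR_CUR_PAGE_COMPRESS_LIMIT works out to 8192 bytes:

	#include <stdio.h>

	#define UNIV_PAGE_SIZE			16384	/* assumed; see univ.i */
	#define BTR_CUR_PAGE_COMPRESS_LIMIT	(UNIV_PAGE_SIZE / 2)

	/* Mirrors the size test in btr_cur_compress_recommendation:
	merging is recommended once the user data on a non-root page drops
	below half a page, or the page is alone on its level */
	static int
	compress_recommended(unsigned long data_size, int only_page_on_level)
	{
		return(data_size < BTR_CUR_PAGE_COMPRESS_LIMIT
		       || only_page_on_level);
	}

	int
	main(void)
	{
		printf("limit: %d bytes\n", BTR_CUR_PAGE_COMPRESS_LIMIT);
		printf("9000 bytes -> %d\n", compress_recommended(9000, 0)); /* 0 */
		printf("8000 bytes -> %d\n", compress_recommended(8000, 0)); /* 1 */

		return(0);
	}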
diff --git a/innobase/include/btr0pcur.h b/innobase/include/btr0pcur.h
new file mode 100644
index 00000000000..c07d5199d8c
--- /dev/null
+++ b/innobase/include/btr0pcur.h
@@ -0,0 +1,486 @@
+/******************************************************
+The index tree persistent cursor
+
+(c) 1996 Innobase Oy
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0pcur_h
+#define btr0pcur_h
+
+#include "univ.i"
+#include "dict0dict.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+#include "page0cur.h"
+#include "btr0cur.h"
+#include "btr0btr.h"
+#include "btr0types.h"
+
+/* Relative positions for a stored cursor position */
+#define BTR_PCUR_ON 1
+#define BTR_PCUR_BEFORE 2
+#define BTR_PCUR_AFTER 3
+
+/******************************************************************
+Allocates memory for a persistent cursor object and initializes the cursor. */
+
+btr_pcur_t*
+btr_pcur_create_for_mysql(void);
+/*============================*/
+ /* out, own: persistent cursor */
+/******************************************************************
+Frees the memory for a persistent cursor object. */
+
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+ btr_pcur_t* cursor); /* in, own: persistent cursor */
+/******************************************************************
+Copies the stored position of a pcur to another pcur. */
+
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+ btr_pcur_t* pcur_receive, /* in: pcur which will receive the
+ position info */
+ btr_pcur_t* pcur_donate); /* in: pcur from which the info is
+ copied */
+/******************************************************************
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur); /* in: persistent cursor */
+/******************************************************************
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open(
+/*==========*/
+ dict_index_t* index, /* in: index */
+ dtuple_t* tuple, /* in: tuple on which search done */
+ ulint mode, /* in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/* in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /* in: memory buffer for persistent cursor */
+ mtr_t* mtr); /* in: mtr */
+/******************************************************************
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init(
+/*=======================*/
+ dict_index_t* index, /* in: index */
+ dtuple_t* tuple, /* in: tuple on which search done */
+ ulint mode, /* in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page of the
+ record! */
+ ulint latch_mode,/* in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /* in: memory buffer for persistent cursor */
+ ulint has_search_latch,/* in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ mtr_t* mtr); /* in: mtr */
+/*********************************************************************
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+ ibool from_left, /* in: TRUE if open to the low end,
+ FALSE if to the high end */
+ dict_index_t* index, /* in: index */
+ ulint latch_mode, /* in: latch mode */
+ btr_pcur_t* pcur, /* in: cursor */
+ ibool do_init, /* in: TRUE if should be initialized */
+ mtr_t* mtr); /* in: mtr */
+/******************************************************************
+Gets the up_match value for a pcur after a search. */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ /* out: number of matched fields at the cursor
+ or to the right if search mode was PAGE_CUR_GE,
+ otherwise undefined */
+ btr_pcur_t* cursor); /* in: memory buffer for persistent cursor */
+/******************************************************************
+Gets the low_match value for a pcur after a search. */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ /* out: number of matched fields at the cursor
+ or to the right if search mode was PAGE_CUR_LE,
+ otherwise undefined */
+ btr_pcur_t* cursor); /* in: memory buffer for persistent cursor */
+/******************************************************************
+If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
+user record satisfying the search condition; in the case of PAGE_CUR_L or
+PAGE_CUR_LE, on the last user record. If no such user record exists, then
+in the first case sets the cursor after last in tree, and in the latter case
+before first in tree. The latching mode must be BTR_SEARCH_LEAF or
+BTR_MODIFY_LEAF. */
+
+void
+btr_pcur_open_on_user_rec(
+/*======================*/
+ dict_index_t* index, /* in: index */
+ dtuple_t* tuple, /* in: tuple on which search done */
+ ulint mode, /* in: PAGE_CUR_L, ... */
+ ulint latch_mode, /* in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /* in: memory buffer for persistent
+ cursor */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos(
+/*=====================*/
+ dict_index_t* index, /* in: index */
+ ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /* in/out: B-tree pcur */
+ mtr_t* mtr); /* in: mtr */
+/******************************************************************
+Frees the possible old_rec_buf buffer of a persistent cursor and sets the
+latch mode of the persistent cursor to BTR_NO_LATCHES. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor); /* in: persistent cursor */
+/******************************************************************
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure. NOTE that the page where the cursor is positioned
+must not be empty! */
+
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr); /* in: mtr */
+/******************************************************************
+If the latch mode of the cursor is BTR_SEARCH_LEAF or BTR_MODIFY_LEAF,
+releases the page latch and bufferfix reserved by the cursor.
+NOTE! In the case of BTR_MODIFY_LEAF, there should not exist changes
+made by the current mini-transaction to the data protected by the
+cursor latch, as then the latch must not be released until mtr_commit. */
+
+void
+btr_pcur_release_leaf(
+/*==================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Gets the rel_pos field for a cursor whose position has been stored. */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ /* out: BTR_PCUR_ON, ... */
+ btr_pcur_t* cursor);/* in: persistent cursor */
+/******************************************************************
+Restores the stored position of a persistent cursor bufferfixing the page and
+obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum. */
+
+ibool
+btr_pcur_restore_position(
+/*======================*/
+ /* out: TRUE if the cursor position
+ was stored when it was on a user record
+ and it can be restored on a user record
+ whose ordering fields are identical to
+ the ones of the original user record */
+ ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /* in: detached persistent cursor */
+ mtr_t* mtr); /* in: mtr */
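+/* A hypothetical sketch (not part of the original interface) of the
+store/detach/restore cycle described above. It assumes pcur is positioned
+inside the mini-transaction mtr, on a page which is not empty. */
+#if 0
+static void
+btr_pcur_detach_and_restore_example(
+/*================================*/
+	btr_pcur_t*	pcur,	/* in: positioned persistent cursor */
+	mtr_t*		mtr)	/* in: mtr holding the cursor latches */
+{
+	ibool	success;
+
+	/* Remember the position and release the latches: the cursor
+	becomes detached */
+
+	btr_pcur_store_position(pcur, mtr);
+	btr_pcur_commit_specify_mtr(pcur, mtr);
+
+	/* ... other work, during which the tree may change ... */
+
+	/* Latch the index again and reposition the cursor */
+
+	mtr_start(mtr);
+	success = btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+	if (!success) {
+		/* a record with identical ordering fields no longer
+		exists: the cursor is now on the closest position, as
+		explained above */
+	}
+
+	mtr_commit(mtr);
+}
+#endif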
+/*************************************************************
+Sets the mtr field for a pcur. */
+UNIV_INLINE
+void
+btr_pcur_set_mtr(
+/*=============*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr); /* in, own: mtr */
+/*************************************************************
+Gets the mtr field for a pcur. */
+UNIV_INLINE
+mtr_t*
+btr_pcur_get_mtr(
+/*=============*/
+ /* out: mtr */
+ btr_pcur_t* cursor); /* in: persistent cursor */
+/******************************************************************
+Commits the pcur mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached. If there have been modifications
+to the page where pcur is positioned, this can be used instead of
+btr_pcur_release_leaf. Function btr_pcur_store_position should be used
+before calling this, if restoration of the cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit(
+/*============*/
+ btr_pcur_t* pcur); /* in: persistent cursor */
+/******************************************************************
+Differs from btr_pcur_commit in that we can specify the mtr to commit. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /* in: persistent cursor */
+ mtr_t* mtr); /* in: mtr to commit */
+/******************************************************************
+Tests if a cursor is detached: that is, the latch mode is BTR_NO_LATCHES. */
+UNIV_INLINE
+ibool
+btr_pcur_is_detached(
+/*=================*/
+ /* out: TRUE if detached */
+ btr_pcur_t* pcur); /* in: persistent cursor */
+/*************************************************************
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'. */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ /* out: TRUE if the cursor was not after last
+ in tree */
+ btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'. */
+
+ibool
+btr_pcur_move_to_prev(
+/*==================*/
+ /* out: TRUE if the cursor was not before first
+ in tree */
+ btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'. */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ /* out: TRUE if the cursor moved forward,
+ ending on a user record */
+ btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Moves the persistent cursor to the first record on the next page.
+Releases the latch on the current page, and bufferunfixes it.
+Note that there must not be modifications on the current page,
+as then the x-latch can be released only in mtr_commit. */
+
+void
+btr_pcur_move_to_next_page(
+/*=======================*/
+ btr_pcur_t* cursor, /* in: persistent cursor; must be on the
+ last record of the current page */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Moves the persistent cursor backward if it is on the first record
+of the page. Releases the latch on the current page, and bufferunfixes
+it. Note that to prevent a possible deadlock, the operation first
+stores the position of the cursor, releases the leaf latch, acquires
+necessary latches and restores the cursor position again before returning.
+The alphabetical position of the cursor is guaranteed to be sensible
+on return, but it may happen that the cursor is not positioned on the
+last record of any page, because the structure of the tree may have
+changed while the cursor had no latches. */
+
+void
+btr_pcur_move_backward_from_page(
+/*=============================*/
+ btr_pcur_t* cursor, /* in: persistent cursor, must be on the
+ first record of the current page */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Returns the btr cursor component of a persistent cursor. */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+ /* out: pointer to btr cursor component */
+ btr_pcur_t* cursor); /* in: persistent cursor */
+/*************************************************************
+Returns the page cursor component of a persistent cursor. */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+ /* out: pointer to page cursor component */
+ btr_pcur_t* cursor); /* in: persistent cursor */
+/*************************************************************
+Returns the page of a persistent cursor. */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+ /* out: pointer to the page */
+ btr_pcur_t* cursor);/* in: persistent cursor */
+/*************************************************************
+Returns the record of a persistent cursor. */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+ /* out: pointer to the record */
+ btr_pcur_t* cursor);/* in: persistent cursor */
+/*************************************************************
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_in_tree(
+/*=============================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_in_tree(
+/*===========================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr); /* in: mtr */
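+/* A hypothetical sketch (not part of the original interface) of a simple
+ascending scan over an index using the movement functions above. It assumes,
+as in other users of this interface, that a cursor opened with
+btr_pcur_open_at_index_side(TRUE, ...) initially rests before the first
+user record, and that the scan is short enough to run inside a single
+mini-transaction. */
+#if 0
+static void
+btr_pcur_scan_example(
+/*==================*/
+	dict_index_t*	index)	/* in: index to scan from the low end */
+{
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+	rec_t*		rec;
+
+	mtr_start(&mtr);
+
+	btr_pcur_open_at_index_side(TRUE, index, BTR_SEARCH_LEAF,
+						&pcur, TRUE, &mtr);
+
+	while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
+
+		rec = btr_pcur_get_rec(&pcur);
+		/* ... process rec ... */
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+}
+#endif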
+
+
+/* The persistent B-tree cursor structure. This is used mainly for SQL
+selects, updates, and deletes. */
+
+struct btr_pcur_struct{
+ btr_cur_t btr_cur; /* a B-tree cursor */
+ ulint latch_mode; /* see FIXME note below!
+ BTR_SEARCH_LEAF, BTR_MODIFY_LEAF,
+ BTR_MODIFY_TREE, or BTR_NO_LATCHES,
+ depending on the latching state of
+ the page and tree where the cursor is
+ positioned; the last value means that
+ the cursor is not currently positioned:
+ we say then that the cursor is
+ detached; it can be restored to
+ attached if the old position was
+ stored in old_rec */
+ ulint old_stored; /* BTR_PCUR_OLD_STORED
+ or BTR_PCUR_OLD_NOT_STORED */
+ rec_t* old_rec; /* if cursor position is stored,
+ contains an initial segment of the
+ latest record cursor was positioned
+ either on, before, or after */
+ ulint rel_pos; /* BTR_PCUR_ON, BTR_PCUR_BEFORE, or
+ BTR_PCUR_AFTER, depending on whether
+ cursor was on, before, or after the
+ old_rec record */
+ dulint modify_clock; /* the modify clock value of the
+ buffer block when the cursor position
+ was stored */
+ ulint pos_state; /* see FIXME note below!
+ BTR_PCUR_IS_POSITIONED,
+ BTR_PCUR_WAS_POSITIONED,
+ BTR_PCUR_NOT_POSITIONED */
+ ulint search_mode; /* PAGE_CUR_G, ... */
+ /*-----------------------------*/
+ /* NOTE that the following fields may possess dynamically allocated
+ memory, which should be freed if not needed anymore! */
+
+ mtr_t* mtr; /* NULL, or this field may contain
+ a mini-transaction which holds the
+ latch on the cursor page */
+ byte* old_rec_buf; /* NULL, or a dynamically allocated
+ buffer for old_rec */
+ ulint buf_size; /* old_rec_buf size if old_rec_buf
+ is not NULL */
+};
+
+#define BTR_PCUR_IS_POSITIONED 1997660512 /* FIXME: currently, the state
+ can be BTR_PCUR_IS_POSITIONED,
+ though it really should be
+ BTR_PCUR_WAS_POSITIONED,
+ because we have no obligation
+ to commit the cursor with
+ mtr; similarly latch_mode may
+ be out of date */
+#define BTR_PCUR_WAS_POSITIONED 1187549791
+#define BTR_PCUR_NOT_POSITIONED 1328997689
+
+#define BTR_PCUR_OLD_STORED 908467085
+#define BTR_PCUR_OLD_NOT_STORED 122766467
+
+#ifndef UNIV_NONINL
+#include "btr0pcur.ic"
+#endif
+
+#endif
diff --git a/innobase/include/btr0pcur.ic b/innobase/include/btr0pcur.ic
new file mode 100644
index 00000000000..7f31f8fe502
--- /dev/null
+++ b/innobase/include/btr0pcur.ic
@@ -0,0 +1,598 @@
+/******************************************************
+The index tree persistent cursor
+
+(c) 1996 Innobase Oy
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+
+/*************************************************************
+Gets the rel_pos field for a cursor whose position has been stored. */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ /* out: BTR_PCUR_ON, ... */
+ btr_pcur_t* cursor) /* in: persistent cursor */
+{
+ ut_ad(cursor);
+ ut_ad(cursor->old_rec);
+ ut_ad(cursor->old_stored == BTR_PCUR_OLD_STORED);
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ return(cursor->rel_pos);
+}
+
+/*************************************************************
+Sets the mtr field for a pcur. */
+UNIV_INLINE
+void
+btr_pcur_set_mtr(
+/*=============*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr) /* in, own: mtr */
+{
+ ut_ad(cursor);
+
+ cursor->mtr = mtr;
+}
+
+/*************************************************************
+Gets the mtr field for a pcur. */
+UNIV_INLINE
+mtr_t*
+btr_pcur_get_mtr(
+/*=============*/
+ /* out: mtr */
+ btr_pcur_t* cursor) /* in: persistent cursor */
+{
+ ut_ad(cursor);
+
+ return(cursor->mtr);
+}
+
+/*************************************************************
+Returns the btr cursor component of a persistent cursor. */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+ /* out: pointer to btr cursor component */
+ btr_pcur_t* cursor) /* in: persistent cursor */
+{
+ return(&(cursor->btr_cur));
+}
+
+/*************************************************************
+Returns the page cursor component of a persistent cursor. */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+ /* out: pointer to page cursor component */
+ btr_pcur_t* cursor) /* in: persistent cursor */
+{
+ return(btr_cur_get_page_cur(&(cursor->btr_cur)));
+}
+
+/*************************************************************
+Returns the page of a persistent cursor. */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+ /* out: pointer to the page */
+ btr_pcur_t* cursor) /* in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(page_cur_get_page(btr_pcur_get_page_cur(cursor)));
+}
+
+/*************************************************************
+Returns the record of a persistent cursor. */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+ /* out: pointer to the record */
+ btr_pcur_t* cursor) /* in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_get_rec(btr_pcur_get_page_cur(cursor)));
+}
+
+/******************************************************************
+Gets the up_match value for a pcur after a search. */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ /* out: number of matched fields at the cursor
+ or to the right if search mode was PAGE_CUR_GE,
+ otherwise undefined */
+ btr_pcur_t* cursor) /* in: memory buffer for persistent cursor */
+{
+ btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ ut_ad(btr_cursor->up_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->up_match);
+}
+
+/******************************************************************
+Gets the low_match value for a pcur after a search. */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ /* out: number of matched fields at the cursor
+ or to the right if search mode was PAGE_CUR_LE,
+ otherwise undefined */
+ btr_pcur_t* cursor) /* in: memory buffer for persistent cursor */
+{
+ btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+ ut_ad(btr_cursor->low_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->low_match);
+}
+
+/*************************************************************
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ UT_NOT_USED(mtr);
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*************************************************************
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ UT_NOT_USED(mtr);
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*************************************************************
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if ((btr_pcur_is_before_first_on_page(cursor, mtr))
+ || (btr_pcur_is_after_last_on_page(cursor, mtr))) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_in_tree(
+/*=============================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_page_get_prev(btr_pcur_get_page(cursor), mtr) != FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*************************************************************
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_in_tree(
+/*===========================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_page_get_next(btr_pcur_get_page(cursor), mtr) != FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*************************************************************
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ UT_NOT_USED(mtr);
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*************************************************************
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ UT_NOT_USED(mtr);
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*************************************************************
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'. */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ /* out: TRUE if the cursor moved forward,
+ ending on a user record */
+ btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+loop:
+ if (btr_pcur_is_after_last_on_page(cursor, mtr)) {
+
+ if (btr_pcur_is_after_last_in_tree(cursor, mtr)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_to_next_page(cursor, mtr);
+ } else {
+ btr_pcur_move_to_next_on_page(cursor, mtr);
+ }
+
+ if (btr_pcur_is_on_user_rec(cursor, mtr)) {
+
+ return(TRUE);
+ }
+
+ goto loop;
+}
+
+/*************************************************************
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'. */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ /* out: TRUE if the cursor was not after last
+ in tree */
+ btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ if (btr_pcur_is_after_last_on_page(cursor, mtr)) {
+
+ if (btr_pcur_is_after_last_in_tree(cursor, mtr)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_to_next_page(cursor, mtr);
+
+ return(TRUE);
+ }
+
+ btr_pcur_move_to_next_on_page(cursor, mtr);
+
+ return(TRUE);
+}
+
+/******************************************************************
+Commits the pcur mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached. If there have been modifications
+to the page where pcur is positioned, this can be used instead of
+btr_pcur_release_leaf. Function btr_pcur_store_position should be used
+before calling this, if restoration of the cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit(
+/*============*/
+ btr_pcur_t* pcur) /* in: persistent cursor */
+{
+ ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ mtr_commit(pcur->mtr);
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/******************************************************************
+Differs from btr_pcur_commit in that we can specify the mtr to commit. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /* in: persistent cursor */
+ mtr_t* mtr) /* in: mtr to commit */
+{
+ ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ mtr_commit(mtr);
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/******************************************************************
+Sets the pcur latch mode to BTR_NO_LATCHES. */
+UNIV_INLINE
+void
+btr_pcur_detach(
+/*============*/
+ btr_pcur_t* pcur) /* in: persistent cursor */
+{
+ ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/******************************************************************
+Tests if a cursor is detached: that is, the latch mode is BTR_NO_LATCHES. */
+UNIV_INLINE
+ibool
+btr_pcur_is_detached(
+/*=================*/
+ /* out: TRUE if detached */
+ btr_pcur_t* pcur) /* in: persistent cursor */
+{
+ if (pcur->latch_mode == BTR_NO_LATCHES) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/******************************************************************
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur) /* in: persistent cursor */
+{
+ pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+ pcur->old_rec_buf = NULL;
+ pcur->old_rec = NULL;
+}
+
+/******************************************************************
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open(
+/*==========*/
+ dict_index_t* index, /* in: index */
+ dtuple_t* tuple, /* in: tuple on which search done */
+ ulint mode, /* in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/* in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /* in: memory buffer for persistent cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ btr_cur_t* btr_cursor;
+
+ /* Initialize the cursor */
+
+ btr_pcur_init(cursor);
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+ btr_cursor, 0, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+}
+
+/******************************************************************
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init(
+/*=======================*/
+ dict_index_t* index, /* in: index */
+ dtuple_t* tuple, /* in: tuple on which search done */
+ ulint mode, /* in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page of the
+ record! */
+ ulint latch_mode,/* in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /* in: memory buffer for persistent cursor */
+ ulint has_search_latch,/* in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ mtr_t* mtr) /* in: mtr */
+{
+ btr_cur_t* btr_cursor;
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+ btr_cursor, has_search_latch, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************************
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+ ibool from_left, /* in: TRUE if open to the low end,
+ FALSE if to the high end */
+ dict_index_t* index, /* in: index */
+ ulint latch_mode, /* in: latch mode */
+ btr_pcur_t* pcur, /* in: cursor */
+ ibool do_init, /* in: TRUE if should be initialized */
+ mtr_t* mtr) /* in: mtr */
+{
+ pcur->latch_mode = latch_mode;
+
+ if (from_left) {
+ pcur->search_mode = PAGE_CUR_G;
+ } else {
+ pcur->search_mode = PAGE_CUR_L;
+ }
+
+ if (do_init) {
+ btr_pcur_init(pcur);
+ }
+
+ btr_cur_open_at_index_side(from_left, index, latch_mode,
+ btr_pcur_get_btr_cur(pcur), mtr);
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/**************************************************************************
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos(
+/*=====================*/
+ dict_index_t* index, /* in: index */
+ ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /* in/out: B-tree pcur */
+ mtr_t* mtr) /* in: mtr */
+{
+ /* Initialize the cursor */
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = PAGE_CUR_G;
+
+ btr_pcur_init(cursor);
+
+ btr_cur_open_at_rnd_pos(index, latch_mode,
+ btr_pcur_get_btr_cur(cursor), mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/******************************************************************
+Frees the possible old_rec_buf buffer of a persistent cursor and sets the
+latch mode of the persistent cursor to BTR_NO_LATCHES.
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor) /* in: persistent cursor */
+{
+ if (cursor->old_rec_buf != NULL) {
+
+ mem_free(cursor->old_rec_buf);
+
+ cursor->old_rec = NULL;
+ cursor->old_rec_buf = NULL;
+ }
+
+ cursor->btr_cur.page_cur.rec = NULL;
+ cursor->old_rec = NULL;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+}
diff --git a/innobase/include/btr0sea.h b/innobase/include/btr0sea.h
new file mode 100644
index 00000000000..c319e16d740
--- /dev/null
+++ b/innobase/include/btr0sea.h
@@ -0,0 +1,269 @@
+/************************************************************************
+The index tree adaptive search
+
+(c) 1996 Innobase Oy
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0sea_h
+#define btr0sea_h
+
+#include "univ.i"
+
+#include "rem0rec.h"
+#include "dict0dict.h"
+#include "btr0types.h"
+#include "mtr0mtr.h"
+#include "ha0ha.h"
+
+/*********************************************************************
+Creates and initializes the adaptive search system at a database start. */
+
+void
+btr_search_sys_create(
+/*==================*/
+ ulint hash_size); /* in: hash index hash table size */
+/************************************************************************
+Returns search info for an index. */
+UNIV_INLINE
+btr_search_t*
+btr_search_get_info(
+/*================*/
+ /* out: search info; search mutex reserved */
+ dict_index_t* index); /* in: index */
+/*********************************************************************
+Creates and initializes a search info struct. */
+
+btr_search_t*
+btr_search_info_create(
+/*===================*/
+ /* out, own: search info struct */
+ mem_heap_t* heap); /* in: heap where created */
+/*************************************************************************
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /* in: index of the cursor */
+ btr_cur_t* cursor);/* in: cursor which was just positioned */
+/**********************************************************************
+Tries to guess the right search position based on the search pattern info
+of the index. */
+
+ibool
+btr_search_guess_on_pattern(
+/*========================*/
+ /* out: TRUE if succeeded */
+ dict_index_t* index, /* in: index */
+ btr_search_t* info, /* in: index search info */
+ dtuple_t* tuple, /* in: logical record */
+ ulint mode, /* in: PAGE_CUR_L, ... */
+ ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /* out: tree cursor */
+ mtr_t* mtr); /* in: mtr */
+/**********************************************************************
+Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values. */
+
+ibool
+btr_search_guess_on_hash(
+/*=====================*/
+ /* out: TRUE if succeeded */
+ dict_index_t* index, /* in: index */
+ btr_search_t* info, /* in: index search info */
+ dtuple_t* tuple, /* in: logical record */
+ ulint mode, /* in: PAGE_CUR_L, ... */
+ ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /* out: tree cursor */
+ ulint has_search_latch,/* in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, RW_X_LATCH, or 0 */
+ mtr_t* mtr); /* in: mtr */
+/************************************************************************
+Moves or deletes hash entries for moved records. If new_page is already hashed,
+then the hash index for page, if any, is dropped. If new_page is not hashed,
+and page is hashed, then a new hash index is built to new_page with the same
+parameters as page (this often happens when a page is split). */
+
+void
+btr_search_move_or_delete_hash_entries(
+/*===================================*/
+ page_t* new_page, /* in: records are copied to this page */
+ page_t* page); /* in: index page */
+/************************************************************************
+Drops a page hash index. */
+
+void
+btr_search_drop_page_hash_index(
+/*============================*/
+ page_t* page); /* in: index page, s- or x-latched */
+/************************************************************************
+Drops a page hash index when a page is freed from a fseg to the file system.
+Drops possible hash index if the page happens to be in the buffer pool. */
+
+void
+btr_search_drop_page_hash_when_freed(
+/*=================================*/
+ ulint space, /* in: space id */
+ ulint page_no); /* in: page number */
+/************************************************************************
+Updates the page hash index when a single record is inserted on a page. */
+
+void
+btr_search_update_hash_node_on_insert(
+/*==================================*/
+ btr_cur_t* cursor);/* in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+/************************************************************************
+Updates the page hash index when a single record is inserted on a page. */
+
+void
+btr_search_update_hash_on_insert(
+/*=============================*/
+ btr_cur_t* cursor);/* in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+/************************************************************************
+Updates the page hash index when a single record is deleted from a page. */
+
+void
+btr_search_update_hash_on_delete(
+/*=============================*/
+ btr_cur_t* cursor);/* in: cursor which was positioned on the
+ record to delete using btr_cur_search_...,
+ the record is not yet deleted */
+/************************************************************************
+Prints info of the search system. */
+
+void
+btr_search_print_info(void);
+/*=======================*/
+/************************************************************************
+Prints info of searches on an index. */
+
+void
+btr_search_index_print_info(
+/*========================*/
+ dict_index_t* index); /* in: index */
+/************************************************************************
+Prints info of searches on a table. */
+
+void
+btr_search_table_print_info(
+/*========================*/
+ char* name); /* in: table name */
+/************************************************************************
+Validates the search system. */
+
+ibool
+btr_search_validate(void);
+/*=====================*/
+
+
+/* Search info directions */
+#define BTR_SEA_NO_DIRECTION 1
+#define BTR_SEA_LEFT 2
+#define BTR_SEA_RIGHT 3
+#define BTR_SEA_SAME_REC 4
+
+/* The search info struct in an index */
+
+struct btr_search_struct{
+ /* The following 4 fields are currently not used: */
+ rec_t* last_search; /* pointer to the lower limit record of the
+ previous search; NULL if not known */
+ ulint n_direction; /* number of consecutive searches in the
+ same direction */
+ ulint direction; /* BTR_SEA_NO_DIRECTION, BTR_SEA_LEFT,
+ BTR_SEA_RIGHT, BTR_SEA_SAME_REC,
+ or BTR_SEA_SAME_PAGE */
+ dulint modify_clock; /* value of modify clock at the time
+ last_search was stored */
+ /*----------------------*/
+ /* The following 4 fields are not protected by any latch: */
+ page_t* root_guess; /* the root page frame when it was last time
+ fetched, or NULL */
+ ulint hash_analysis; /* when this exceeds a certain value, the
+ hash analysis starts; this is reset if no
+ success noticed */
+ ibool last_hash_succ; /* TRUE if the last search would have
+ succeeded, or did succeed, using the hash
+ index; NOTE that the value here is not exact:
+ it is not calculated for every search, and the
+ calculation itself is not always accurate! */
+ ulint n_hash_potential;/* number of consecutive searches which would
+ have succeeded, or did succeed, using the hash
+ index */
+ /*----------------------*/
+ ulint n_fields; /* recommended prefix length for hash search:
+ number of full fields */
+ ulint n_bytes; /* recommended prefix: number of bytes in
+ an incomplete field */
+ ulint side; /* BTR_SEARCH_LEFT_SIDE or
+ BTR_SEARCH_RIGHT_SIDE, depending on whether
+ the leftmost record of several records with
+ the same prefix should be indexed in the
+ hash index */
+ /*----------------------*/
+ ulint n_hash_succ; /* number of successful hash searches thus
+ far */
+ ulint n_hash_fail; /* number of failed hash searches */
+ ulint n_patt_succ; /* number of successful pattern searches thus
+ far */
+ ulint n_searches; /* number of searches */
+};
+
+/* The hash index system */
+
+typedef struct btr_search_sys_struct btr_search_sys_t;
+
+struct btr_search_sys_struct{
+ hash_table_t* hash_index;
+};
+
+extern btr_search_sys_t* btr_search_sys;
+
+/* The latch protecting the adaptive search system: this latch protects
+(1) the positions of records on those pages where a hash index has been built.
+NOTE: it does not protect values of non-ordering fields within a record from
+being updated in-place! We can use fact (1) to perform unique searches on
+indexes. */
+
+extern rw_lock_t* btr_search_latch_temp;
+
+#define btr_search_latch (*btr_search_latch_temp)
+
+extern ulint btr_search_n_succ;
+extern ulint btr_search_n_hash_fail;
+
+/* After change in n_fields or n_bytes in info, this many rounds are waited
+before starting the hash analysis again: this is to save CPU time when there
+is no hope in building a hash index. */
+
+#define BTR_SEARCH_HASH_ANALYSIS 17
+
+#define BTR_SEARCH_LEFT_SIDE 1
+#define BTR_SEARCH_RIGHT_SIDE 2
+
+/* Limit of consecutive searches for trying a search shortcut on the search
+pattern */
+
+#define BTR_SEARCH_ON_PATTERN_LIMIT 3
+
+/* Limit of consecutive searches for trying a search shortcut using the hash
+index */
+
+#define BTR_SEARCH_ON_HASH_LIMIT 3
+
+#ifndef UNIV_NONINL
+#include "btr0sea.ic"
+#endif
+
+#endif
diff --git a/innobase/include/btr0sea.ic b/innobase/include/btr0sea.ic
new file mode 100644
index 00000000000..63a3a658cf4
--- /dev/null
+++ b/innobase/include/btr0sea.ic
@@ -0,0 +1,65 @@
+/************************************************************************
+The index tree adaptive search
+
+(c) 1996 Innobase Oy
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "dict0mem.h"
+#include "btr0cur.h"
+#include "buf0buf.h"
+
+/*************************************************************************
+Updates the search info. */
+
+void
+btr_search_info_update_slow(
+/*========================*/
+ btr_search_t* info, /* in: search info */
+ btr_cur_t* cursor);/* in: cursor which was just positioned */
+
+/************************************************************************
+Returns search info for an index. */
+UNIV_INLINE
+btr_search_t*
+btr_search_get_info(
+/*================*/
+ /* out: search info; search mutex reserved */
+ dict_index_t* index) /* in: index */
+{
+ ut_ad(index);
+
+ return(index->search_info);
+}
+
+/*************************************************************************
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /* in: index of the cursor */
+ btr_cur_t* cursor) /* in: cursor which was just positioned */
+{
+ btr_search_t* info;
+
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)
+ && !rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+
+ info = btr_search_get_info(index);
+
+ info->hash_analysis++;
+
+ if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) {
+
+ /* Do nothing */
+
+ return;
+
+ }
+
+ ut_ad(cursor->flag != BTR_CUR_HASH);
+
+ btr_search_info_update_slow(info, cursor);
+}
diff --git a/innobase/include/btr0types.h b/innobase/include/btr0types.h
new file mode 100644
index 00000000000..03a61480e2e
--- /dev/null
+++ b/innobase/include/btr0types.h
@@ -0,0 +1,21 @@
+/************************************************************************
+The index tree general types
+
+(c) 1996 Innobase Oy
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0types_h
+#define btr0types_h
+
+#include "univ.i"
+
+#include "rem0types.h"
+#include "page0types.h"
+
+typedef struct btr_pcur_struct btr_pcur_t;
+typedef struct btr_cur_struct btr_cur_t;
+typedef struct btr_search_struct btr_search_t;
+
+#endif
diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h
new file mode 100644
index 00000000000..08c59d60c91
--- /dev/null
+++ b/innobase/include/buf0buf.h
@@ -0,0 +1,834 @@
+/* Innobase relational database engine; Copyright (C) 2001 Innobase Oy
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License 2
+ as published by the Free Software Foundation in June 1991.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License 2
+ along with this program (in file COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+/******************************************************
+The database buffer pool high-level routines
+
+(c) 1995 Innobase Oy
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0buf_h
+#define buf0buf_h
+
+#include "univ.i"
+#include "fil0fil.h"
+#include "mtr0types.h"
+#include "buf0types.h"
+#include "sync0rw.h"
+#include "hash0hash.h"
+#include "ut0byte.h"
+
+/* Flags for flush types */
+#define BUF_FLUSH_LRU 1
+#define BUF_FLUSH_SINGLE_PAGE 2
+#define BUF_FLUSH_LIST 3 /* An array in the pool struct
+ has size BUF_FLUSH_LIST + 1: if you
+ add more flush types, put them in
+ the middle! */
+/* Modes for buf_page_get_gen */
+#define BUF_GET 10 /* get always */
+#define BUF_GET_IF_IN_POOL 11 /* get if in pool */
+#define BUF_GET_NOWAIT 12 /* get if can set the latch without
+ waiting */
+#define BUF_GET_NO_LATCH 14 /* get and bufferfix, but set no latch;
+ we have separated this case, because
+ it is error-prone programming not to
+ set a latch, and it should be used
+ with care */
+/* Modes for buf_page_get_known_nowait */
+#define BUF_MAKE_YOUNG 51
+#define BUF_KEEP_OLD 52
+
+extern buf_pool_t* buf_pool; /* The buffer pool of the database */
+extern ibool buf_debug_prints;/* If this is set TRUE, the program
+ prints info whenever read or flush
+ occurs */
+
+/************************************************************************
+Initializes the buffer pool of the database. */
+
+void
+buf_pool_init(
+/*==========*/
+ ulint max_size, /* in: maximum size of the pool in blocks */
+ ulint curr_size); /* in: current size to use, must be <=
+ max_size */
+/*************************************************************************
+Gets the current size of buffer pool in bytes. */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void);
+/*========================*/
+ /* out: size in bytes */
+/*************************************************************************
+Gets the maximum size of buffer pool in bytes. */
+UNIV_INLINE
+ulint
+buf_pool_get_max_size(void);
+/*=======================*/
+ /* out: size in bytes */
+/************************************************************************
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+ut_dulint_zero if all modified pages have been flushed to disk. */
+UNIV_INLINE
+dulint
+buf_pool_get_oldest_modification(void);
+/*==================================*/
+ /* out: oldest modification in pool,
+ ut_dulint_zero if none */
+/*************************************************************************
+Allocates a buffer frame. */
+
+buf_frame_t*
+buf_frame_alloc(void);
+/*==================*/
+ /* out: buffer frame */
+/*************************************************************************
+Frees a buffer frame which does not contain a file page. */
+
+void
+buf_frame_free(
+/*===========*/
+ buf_frame_t* frame); /* in: buffer frame */
+/*************************************************************************
+Copies contents of a buffer frame to a given buffer. */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+ /* out: buf */
+ byte* buf, /* in: buffer to copy to */
+ buf_frame_t* frame); /* in: buffer frame */
+/******************************************************************
+NOTE! The following macros should be used instead of buf_page_get_gen,
+to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed
+as LA! */
+#ifdef UNIV_SYNC_DEBUG
+#define buf_page_get(SP, OF, LA, MTR) buf_page_get_gen(\
+ SP, OF, LA, NULL,\
+ BUF_GET, __FILE__, __LINE__, MTR)
+#else
+#define buf_page_get(SP, OF, LA, MTR) buf_page_get_gen(\
+ SP, OF, LA, NULL,\
+ BUF_GET, MTR)
+#endif
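+/* A hypothetical sketch (not part of the original interface) of fetching a
+page with the buf_page_get macro above inside a mini-transaction. The space
+id and page number are assumed to identify an existing file page. */
+#if 0
+static void
+buf_page_get_example(
+/*=================*/
+	ulint	space,	/* in: space id of an existing page */
+	ulint	page_no)/* in: page number of an existing page */
+{
+	buf_frame_t*	frame;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	/* s-latch and bufferfix the page; both are released when the
+	mini-transaction commits */
+
+	frame = buf_page_get(space, page_no, RW_S_LATCH, &mtr);
+
+	/* ... read the page contents through frame ... */
+
+	mtr_commit(&mtr);
+}
+#endif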
+/******************************************************************
+Use these macros to bufferfix a page with no latching. Remember not to
+read the contents of the page unless you know it is safe. Do not modify
+the contents of the page! We have separated this case, because it is
+error-prone programming not to set a latch, and it should be used
+with care. */
+#ifdef UNIV_SYNC_DEBUG
+#define buf_page_get_with_no_latch(SP, OF, MTR) buf_page_get_gen(\
+ SP, OF, RW_NO_LATCH, NULL,\
+ BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR)
+#else
+#define buf_page_get_with_no_latch(SP, OF, MTR) buf_page_get_gen(\
+ SP, OF, RW_NO_LATCH, NULL,\
+ BUF_GET_NO_LATCH, MTR)
+#endif
+/******************************************************************
+NOTE! The following macros should be used instead of buf_page_get_gen, to
+improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed as LA! */
+#ifdef UNIV_SYNC_DEBUG
+#define buf_page_get_nowait(SP, OF, LA, MTR) buf_page_get_gen(\
+ SP, OF, LA, NULL,\
+ BUF_GET_NOWAIT, __FILE__, __LINE__, MTR)
+#else
+#define buf_page_get_nowait(SP, OF, LA, MTR) buf_page_get_gen(\
+ SP, OF, LA, NULL,\
+ BUF_GET_NOWAIT, MTR)
+#endif
+/******************************************************************
+NOTE! The following macros should be used instead of
+buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and
+RW_X_LATCH are allowed as LA! */
+#ifdef UNIV_SYNC_DEBUG
+#define buf_page_optimistic_get(LA, G, MC, MTR) buf_page_optimistic_get_func(\
+ LA, G, MC, __FILE__, __LINE__, MTR)
+#else
+#define buf_page_optimistic_get(LA, G, MC, MTR) buf_page_optimistic_get_func(\
+ LA, G, MC, MTR)
+#endif
+/************************************************************************
+This is the general function used to get optimistic access to a database
+page. */
+
+ibool
+buf_page_optimistic_get_func(
+/*=========================*/
+ /* out: TRUE if success */
+ ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
+ buf_frame_t* guess, /* in: guessed frame */
+ dulint modify_clock,/* in: modify clock value if mode is
+ ..._GUESS_ON_CLOCK */
+#ifdef UNIV_SYNC_DEBUG
+ char* file, /* in: file name */
+ ulint line, /* in: line where called */
+#endif
+ mtr_t* mtr); /* in: mini-transaction */
+/************************************************************************
+Tries to get the page, but if file io is required, releases all latches
+in mtr down to the given savepoint. If io is required, this function
+retrieves the page to buffer buf_pool, but does not bufferfix it or latch
+it. */
+UNIV_INLINE
+buf_frame_t*
+buf_page_get_release_on_io(
+/*=======================*/
+ /* out: pointer to the frame, or NULL
+ if not in buffer buf_pool */
+ ulint space, /* in: space id */
+ ulint offset, /* in: offset of the page within space
+ in units of a page */
+ buf_frame_t* guess, /* in: guessed frame or NULL */
+ ulint rw_latch, /* in: RW_X_LATCH, RW_S_LATCH,
+ or RW_NO_LATCH */
+ ulint savepoint, /* in: mtr savepoint */
+ mtr_t* mtr); /* in: mtr */
+/************************************************************************
+This is used to get access to a known database page, when no waiting can be
+done. */
+
+ibool
+buf_page_get_known_nowait(
+/*======================*/
+ /* out: TRUE if success */
+ ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
+ buf_frame_t* guess, /* in: the known page frame */
+ ulint mode, /* in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
+#ifdef UNIV_SYNC_DEBUG
+ char* file, /* in: file name */
+ ulint line, /* in: line where called */
+#endif
+ mtr_t* mtr); /* in: mini-transaction */
+/************************************************************************
+This is the general function used to get access to a database page. */
+
+buf_frame_t*
+buf_page_get_gen(
+/*=============*/
+ /* out: pointer to the frame or NULL */
+ ulint space, /* in: space id */
+ ulint offset, /* in: page number */
+ ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+ buf_frame_t* guess, /* in: guessed frame or NULL */
+ ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL,
+ BUF_GET_NO_LATCH */
+#ifdef UNIV_SYNC_DEBUG
+ char* file, /* in: file name */
+ ulint line, /* in: line where called */
+#endif
+ mtr_t* mtr); /* in: mini-transaction */
+/************************************************************************
+Initializes a page to the buffer buf_pool. The page is usually not read
+from a file even if it cannot be found in the buffer buf_pool. This is one
+of the functions which perform a state transition NOT_USED =>
+FILE_PAGE on a block (the other is buf_page_init_for_read above). */
+
+buf_frame_t*
+buf_page_create(
+/*============*/
+ /* out: pointer to the frame, page bufferfixed */
+ ulint space, /* in: space id */
+ ulint offset, /* in: offset of the page within space in units of
+ a page */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Decrements the bufferfix count of a buffer control block and releases
+a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+ buf_block_t* block, /* in: buffer block */
+ ulint rw_latch, /* in: RW_S_LATCH, RW_X_LATCH,
+ RW_NO_LATCH */
+ mtr_t* mtr); /* in: mtr */
+/************************************************************************
+Moves a page to the start of the buffer pool LRU list. This high-level
+function can be used to prevent an important page from slipping out of
+the buffer pool. */
+
+void
+buf_page_make_young(
+/*=================*/
+ buf_frame_t* frame); /* in: buffer frame of a file page */
+/************************************************************************
+Returns TRUE if the page can be found in the buffer pool hash table. NOTE
+that it is possible that the page is not yet read from disk, though. */
+
+ibool
+buf_page_peek(
+/*==========*/
+ /* out: TRUE if found from page hash table,
+ NOTE that the page is not necessarily yet read
+ from disk! */
+ ulint space, /* in: space id */
+ ulint offset);/* in: page number */
+/************************************************************************
+Returns the buffer control block if the page can be found in the buffer
+pool. NOTE that it is possible that the page is not yet read
+from disk, though. This is a very low-level function: use with care! */
+
+buf_block_t*
+buf_page_peek_block(
+/*================*/
+ /* out: control block if found from page hash table,
+ otherwise NULL; NOTE that the page is not necessarily
+ yet read from disk! */
+ ulint space, /* in: space id */
+ ulint offset);/* in: page number */
+/************************************************************************
+Recommends a move of a block to the start of the LRU list if there is danger
+of dropping from the buffer pool. NOTE: does not reserve the buffer pool
+mutex. */
+UNIV_INLINE
+ibool
+buf_block_peek_if_too_old(
+/*======================*/
+ /* out: TRUE if should be made younger */
+ buf_block_t* block); /* in: block to make younger */
+/************************************************************************
+Returns the current state of is_hashed of a page. FALSE if the page is
+not in the pool. NOTE that this operation does not fix the page in the
+pool if it is found there. */
+
+ibool
+buf_page_peek_if_search_hashed(
+/*===========================*/
+ /* out: TRUE if page hash index is built in search
+ system */
+ ulint space, /* in: space id */
+ ulint offset);/* in: page number */
+/************************************************************************
+Gets the youngest modification log sequence number for a frame.
+Returns zero if not file page or no modification occurred yet. */
+UNIV_INLINE
+dulint
+buf_frame_get_newest_modification(
+/*==============================*/
+ /* out: newest modification to page */
+ buf_frame_t* frame); /* in: pointer to a frame */
+/************************************************************************
+Increments the modify clock of a frame by 1. The caller must (1) own the
+pool mutex and the block bufferfix count must be zero, or (2) own an x-lock
+on the block. */
+UNIV_INLINE
+dulint
+buf_frame_modify_clock_inc(
+/*=======================*/
+ /* out: new value */
+ buf_frame_t* frame); /* in: pointer to a frame */
+/************************************************************************
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block. */
+UNIV_INLINE
+dulint
+buf_frame_get_modify_clock(
+/*=======================*/
+ /* out: value */
+ buf_frame_t* frame); /* in: pointer to a frame */
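+/* A hypothetical sketch (not part of the original interface) of the
+optimistic access pattern supported by the modify clock: the caller saves
+the frame pointer and the modify clock value while still holding a latch,
+releases its latches, and later tries to regain access without a page hash
+lookup. If the block has been evicted or modified in between, the guess
+fails and a normal buf_page_get is needed. */
+#if 0
+static ibool
+buf_optimistic_reget_example(
+/*=========================*/
+	buf_frame_t*	guess,		/* in: frame latched earlier */
+	dulint		modify_clock,	/* in: value returned by
+					buf_frame_get_modify_clock while
+					the latch was still held */
+	mtr_t*		mtr)		/* in: mini-transaction */
+{
+	/* succeeds only if the frame still contains the same page and
+	its modify clock has not changed since the value was saved */
+
+	return(buf_page_optimistic_get(RW_S_LATCH, guess, modify_clock,
+									mtr));
+}
+#endif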
+/**************************************************************************
+Gets the page number of a pointer pointing within a buffer frame containing
+a file page. */
+UNIV_INLINE
+ulint
+buf_frame_get_page_no(
+/*==================*/
+ /* out: page number */
+ byte* ptr); /* in: pointer to within a buffer frame */
+/**************************************************************************
+Gets the space id of a pointer pointing within a buffer frame containing a
+file page. */
+UNIV_INLINE
+ulint
+buf_frame_get_space_id(
+/*===================*/
+ /* out: space id */
+ byte* ptr); /* in: pointer to within a buffer frame */
+/**************************************************************************
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+ byte* ptr, /* in: pointer to a buffer frame */
+ ulint* space, /* out: space id */
+ fil_addr_t* addr); /* out: page offset and byte offset */
+/**************************************************************************
+Gets the hash value of the page the pointer is pointing to. This can be used
+in searches in the lock hash table. */
+UNIV_INLINE
+ulint
+buf_frame_get_lock_hash_val(
+/*========================*/
+ /* out: lock hash value */
+ byte* ptr); /* in: pointer to within a buffer frame */
+/**************************************************************************
+Gets the mutex protecting the page record lock hash chain in the lock
+table. */
+UNIV_INLINE
+mutex_t*
+buf_frame_get_lock_mutex(
+/*=====================*/
+ /* out: mutex */
+ byte* ptr); /* in: pointer to within a buffer frame */
+/***********************************************************************
+Gets the frame the pointer is pointing to. */
+UNIV_INLINE
+buf_frame_t*
+buf_frame_align(
+/*============*/
+			/* out: pointer to frame */
+ byte* ptr); /* in: pointer to a frame */
+/***********************************************************************
+Checks if a pointer points to the block array of the buffer pool (blocks, not
+the frames). */
+UNIV_INLINE
+ibool
+buf_pool_is_block(
+/*==============*/
+ /* out: TRUE if pointer to block */
+ void* ptr); /* in: pointer to memory */
+/*************************************************************************
+Validates the buffer pool data structure. */
+
+ibool
+buf_validate(void);
+/*==============*/
+/*************************************************************************
+Prints info of the buffer pool data structure. */
+
+void
+buf_print(void);
+/*===========*/
+/*************************************************************************
+Prints info of the buffer i/o. */
+
+void
+buf_print_io(void);
+/*==============*/
+/*************************************************************************
+Checks that all file pages in the buffer are in a replaceable state. */
+
+ibool
+buf_all_freed(void);
+/*===============*/
+/*************************************************************************
+Checks that there currently are no pending i/o-operations for the buffer
+pool. */
+
+ibool
+buf_pool_check_no_pending_io(void);
+/*==============================*/
+ /* out: TRUE if there is no pending i/o */
+/*************************************************************************
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. */
+
+void
+buf_pool_invalidate(void);
+/*=====================*/
+
+/*========================================================================
+--------------------------- LOWER LEVEL ROUTINES -------------------------
+=========================================================================*/
+
+/*************************************************************************
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. If
+UNIV_SYNC_DEBUG is not defined, compiles to an empty function. */
+UNIV_INLINE
+void
+buf_page_dbg_add_level(
+/*===================*/
+ buf_frame_t* frame, /* in: buffer page where we have acquired
+ a latch */
+ ulint level); /* in: latching order level */
+/*************************************************************************
+Gets a pointer to the memory frame of a block. */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+ /* out: pointer to the frame */
+ buf_block_t* block); /* in: pointer to the control block */
+/*************************************************************************
+Gets the space id of a block. */
+UNIV_INLINE
+ulint
+buf_block_get_space(
+/*================*/
+ /* out: space id */
+ buf_block_t* block); /* in: pointer to the control block */
+/*************************************************************************
+Gets the page number of a block. */
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+ /* out: page number */
+ buf_block_t* block); /* in: pointer to the control block */
+/***********************************************************************
+Gets the block whose frame the pointer is pointing to. */
+UNIV_INLINE
+buf_block_t*
+buf_block_align(
+/*============*/
+ /* out: pointer to block */
+ byte* ptr); /* in: pointer to a frame */
+/************************************************************************
+This function is used to check whether an i/o operation is in
+progress on a buffer page. */
+UNIV_INLINE
+ibool
+buf_page_io_query(
+/*==============*/
+ /* out: TRUE if io going on */
+ buf_block_t* block); /* in: pool block, must be bufferfixed */
+/***********************************************************************
+Accessor function for block array. */
+UNIV_INLINE
+buf_block_t*
+buf_pool_get_nth_block(
+/*===================*/
+ /* out: pointer to block */
+ buf_pool_t* pool, /* in: pool */
+ ulint i); /* in: index of the block */
+/************************************************************************
+Function which inits a page for a read into the buf_pool. If the page is
+already in the buf_pool, does nothing. Sets the io_fix flag to BUF_IO_READ and
+sets a non-recursive exclusive lock on the buffer frame. The io-handler must
+take care that the flag is cleared and the lock released later. This is one
+of the functions which perform the state transition NOT_USED => FILE_PAGE on
+a block (the other is buf_page_create). */
+
+buf_block_t*
+buf_page_init_for_read(
+/*===================*/
+ /* out: pointer to the block */
+ ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */
+ ulint space, /* in: space id */
+ ulint offset);/* in: page number */
+/************************************************************************
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool. */
+
+void
+buf_page_io_complete(
+/*=================*/
+ buf_block_t* block); /* in: pointer to the block in question */
+/************************************************************************
+Calculates a folded value of a file page address to use in the page hash
+table. */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+ /* out: the folded value */
+ ulint space, /* in: space id */
+ ulint offset);/* in: offset of the page within space */
+/**********************************************************************
+Returns the control block of a file page, NULL if not found. */
+UNIV_INLINE
+buf_block_t*
+buf_page_hash_get(
+/*==============*/
+ /* out: block, NULL if not found */
+ ulint space, /* in: space id */
+ ulint offset);/* in: offset of the page within space */
+/***********************************************************************
+Increments the pool clock by one and returns its new value. Remember that
+in the 32 bit version the clock wraps around at 4 billion! */
+UNIV_INLINE
+ulint
+buf_pool_clock_tic(void);
+/*====================*/
+ /* out: new clock value */
+/*************************************************************************
+Gets the current length of the free list of buffer blocks. */
+
+ulint
+buf_get_free_list_len(void);
+/*=======================*/
+
+
+
+/* The buffer control block structure */
+
+struct buf_block_struct{
+
+ /* 1. General fields */
+
+ ulint state; /* state of the control block:
+ BUF_BLOCK_NOT_USED, ... */
+ byte* frame; /* pointer to buffer frame which
+ is of size UNIV_PAGE_SIZE, and
+ aligned to an address divisible by
+ UNIV_PAGE_SIZE */
+ ulint space; /* space id of the page */
+ ulint offset; /* page number within the space */
+ ulint lock_hash_val; /* hashed value of the page address
+ in the record lock hash table */
+ mutex_t* lock_mutex; /* mutex protecting the chain in the
+ record lock hash table */
+ rw_lock_t lock; /* read-write lock of the buffer
+ frame */
+ rw_lock_t read_lock; /* rw-lock reserved when a page read
+ to the frame is requested; a thread
+ can wait for this rw-lock if it wants
+ to wait for the read to complete;
+ the usual way is to wait for lock,
+ but if the thread just wants a
+ bufferfix and no latch on the page,
+ then it can wait for this rw-lock */
+ buf_block_t* hash; /* node used in chaining to the page
+ hash table */
+ /* 2. Page flushing fields */
+
+ UT_LIST_NODE_T(buf_block_t) flush_list;
+ /* node of the modified, not yet
+ flushed blocks list */
+ dulint newest_modification;
+ /* log sequence number of the youngest
+ modification to this block, zero if
+ not modified */
+ dulint oldest_modification;
+ /* log sequence number of the START of
+ the log entry written of the oldest
+ modification to this block which has
+ not yet been flushed on disk; zero if
+ all modifications are on disk */
+ ulint flush_type; /* if this block is currently being
+ flushed to disk, this tells the
+ flush_type: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST */
+
+ /* 3. LRU replacement algorithm fields */
+
+ UT_LIST_NODE_T(buf_block_t) free;
+ /* node of the free block list */
+ UT_LIST_NODE_T(buf_block_t) LRU;
+ /* node of the LRU list */
+ ulint LRU_position; /* value which monotonically
+ decreases (or may stay constant if
+ the block is in the old blocks) toward
+ the end of the LRU list, if the pool
+ ulint_clock has not wrapped around:
+ NOTE that this value can only be used
+ in heuristic algorithms, because of
+ the possibility of a wrap-around! */
+	ulint		freed_page_clock;/* the value of the buf_pool's
+					freed_page_clock when this block was
+					last put to the head of the
+					LRU list */
+ ibool old; /* TRUE if the block is in the old
+ blocks in the LRU list */
+ ibool accessed; /* TRUE if the page has been accessed
+ while in the buffer pool: read-ahead
+ may read in pages which have not been
+ accessed yet */
+ ulint buf_fix_count; /* count of how manyfold this block
+ is currently bufferfixed */
+ ulint io_fix; /* if a read is pending to the frame,
+ io_fix is BUF_IO_READ, in the case
+ of a write BUF_IO_WRITE, otherwise 0 */
+ /* 4. Optimistic search field */
+
+ dulint modify_clock; /* this clock is incremented every
+ time a pointer to a record on the
+ page may become obsolete; this is
+ used in the optimistic cursor
+ positioning: if the modify clock has
+ not changed, we know that the pointer
+ is still valid; this field may be
+ changed if the thread (1) owns the
+ pool mutex and the page is not
+ bufferfixed, or (2) the thread has an
+ x-latch on the block */
+
+ /* 5. Hash search fields: NOTE that these fields are protected by
+ btr_search_mutex */
+
+ ulint n_hash_helps; /* counter which controls building
+ of a new hash index for the page */
+ ulint n_fields; /* recommended prefix length for hash
+ search: number of full fields */
+ ulint n_bytes; /* recommended prefix: number of bytes
+ in an incomplete field */
+ ulint side; /* BTR_SEARCH_LEFT_SIDE or
+ BTR_SEARCH_RIGHT_SIDE, depending on
+ whether the leftmost record of several
+ records with the same prefix should be
+ indexed in the hash index */
+ ibool is_hashed; /* TRUE if hash index has already been
+ built on this page; note that it does
+ not guarantee that the index is
+ complete, though: there may have been
+ hash collisions, record deletions,
+ etc. */
+ ulint curr_n_fields; /* prefix length for hash indexing:
+ number of full fields */
+ ulint curr_n_bytes; /* number of bytes in hash indexing */
+ ulint curr_side; /* BTR_SEARCH_LEFT_SIDE or
+ BTR_SEARCH_RIGHT_SIDE in hash
+ indexing */
+ /* 6. Debug fields */
+
+ rw_lock_t debug_latch; /* in the debug version, each thread
+ which bufferfixes the block acquires
+ an s-latch here; so we can use the
+ debug utilities in sync0rw */
+};
+
+/* The buffer pool structure. NOTE! The definition appears here only for
+other modules of this directory (buf) to see it. Do not use from outside! */
+
+struct buf_pool_struct{
+
+ /* 1. General fields */
+
+ mutex_t mutex; /* mutex protecting the buffer pool
+ struct and control blocks, except the
+ read-write lock in them */
+ byte* frame_mem; /* pointer to the memory area which
+ was allocated for the frames */
+ byte* frame_zero; /* pointer to the first buffer frame:
+ this may differ from frame_mem, because
+ this is aligned by the frame size */
+ buf_block_t* blocks; /* array of buffer control blocks */
+ ulint max_size; /* number of control blocks ==
+ maximum pool size in pages */
+ ulint curr_size; /* current pool size in pages */
+ hash_table_t* page_hash; /* hash table of the file pages */
+
+ ulint n_pend_reads; /* number of pending read operations */
+	ulint		n_pages_read;	/* number of read operations */
+	ulint		n_pages_written;/* number of write operations */
+ ulint n_pages_created;/* number of pages created in the pool
+ with no read */
+ /* 2. Page flushing algorithm fields */
+
+ UT_LIST_BASE_NODE_T(buf_block_t) flush_list;
+ /* base node of the modified block
+ list */
+ ibool init_flush[BUF_FLUSH_LIST + 1];
+ /* this is TRUE when a flush of the
+ given type is being initialized */
+ ulint n_flush[BUF_FLUSH_LIST + 1];
+ /* this is the number of pending
+ writes in the given flush type */
+ os_event_t no_flush[BUF_FLUSH_LIST + 1];
+ /* this is in the set state when there
+ is no flush batch of the given type
+ running */
+ ulint ulint_clock; /* a sequence number used to count
+ time. NOTE! This counter wraps
+ around at 4 billion (if ulint ==
+ 32 bits)! */
+ ulint freed_page_clock;/* a sequence number used to count the
+ number of buffer blocks removed from
+ the end of the LRU list; NOTE that
+ this counter may wrap around at 4
+ billion! */
+ ulint LRU_flush_ended;/* when an LRU flush ends for a page,
+ this is incremented by one; this is
+ set to zero when a buffer block is
+ allocated */
+
+ /* 3. LRU replacement algorithm fields */
+
+ UT_LIST_BASE_NODE_T(buf_block_t) free;
+ /* base node of the free block list */
+ UT_LIST_BASE_NODE_T(buf_block_t) LRU;
+ /* base node of the LRU list */
+ buf_block_t* LRU_old; /* pointer to the about 3/8 oldest
+ blocks in the LRU list; NULL if LRU
+ length less than BUF_LRU_OLD_MIN_LEN */
+ ulint LRU_old_len; /* length of the LRU list from
+ the block to which LRU_old points
+ onward, including that block;
+ see buf0lru.c for the restrictions
+ on this value; not defined if
+ LRU_old == NULL */
+};
+
+/* States of a control block */
+#define BUF_BLOCK_NOT_USED 211 /* is in the free list */
+#define BUF_BLOCK_READY_FOR_USE 212 /* when buf_get_free_block returns
+ a block, it is in this state */
+#define BUF_BLOCK_FILE_PAGE 213 /* contains a buffered file page */
+#define BUF_BLOCK_MEMORY 214 /* contains some main memory object */
+#define BUF_BLOCK_REMOVE_HASH 215 /* hash index should be removed
+ before putting to the free list */
+
+/* Io_fix states of a control block; these must be != 0 */
+#define BUF_IO_READ 561
+#define BUF_IO_WRITE 562
+
+/************************************************************************
+Let us list the consistency conditions for different control block states.
+
+NOT_USED: is in free list, not in LRU list, not in flush list, nor
+ page hash table
+READY_FOR_USE: is not in free list, LRU list, or flush list, nor page
+ hash table
+MEMORY: is not in free list, LRU list, or flush list, nor page
+ hash table
+FILE_PAGE: space and offset are defined, is in page hash table
+ if io_fix == BUF_IO_WRITE,
+ pool: no_flush[block->flush_type] is in reset state,
+ pool: n_flush[block->flush_type] > 0
+
+ (1) if buf_fix_count == 0, then
+ is in LRU list, not in free list
+ is in flush list,
+ if and only if oldest_modification > 0
+ is x-locked,
+ if and only if io_fix == BUF_IO_READ
+ is s-locked,
+ if and only if io_fix == BUF_IO_WRITE
+
+ (2) if buf_fix_count > 0, then
+ is not in LRU list, not in free list
+ is in flush list,
+ if and only if oldest_modification > 0
+ if io_fix == BUF_IO_READ,
+ is x-locked
+ if io_fix == BUF_IO_WRITE,
+ is s-locked
+
+State transitions:
+
+NOT_USED => READY_FOR_USE
+READY_FOR_USE => MEMORY
+READY_FOR_USE => FILE_PAGE
+MEMORY => NOT_USED
+FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if
+ (1) buf_fix_count == 0,
+ (2) oldest_modification == 0, and
+ (3) io_fix == 0.
+*/
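+
+/* Illustrative example (editorial sketch, not part of the original
+header): the FILE_PAGE => NOT_USED precondition above corresponds to the
+following check over the buf_block_struct fields defined earlier in this
+file; ut_dulint_is_zero is used because oldest_modification is a dulint
+rather than a plain ulint:
+
+	(block->buf_fix_count == 0)
+	&& ut_dulint_is_zero(block->oldest_modification)
+	&& (block->io_fix == 0)
+*/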
+
+#ifndef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#endif
diff --git a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic
new file mode 100644
index 00000000000..24ada36bca2
--- /dev/null
+++ b/innobase/include/buf0buf.ic
@@ -0,0 +1,641 @@
+/******************************************************
+The database buffer buf_pool
+
+(c) 1995 Innobase Oy
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "mtr0mtr.h"
+
+extern ulint buf_dbg_counter; /* This is used to insert validation
+ operations in execution in the
+ debug version */
+
+/************************************************************************
+Recommends a move of a block to the start of the LRU list if there is danger
+of dropping from the buffer pool. NOTE: does not reserve the buffer pool
+mutex. */
+UNIV_INLINE
+ibool
+buf_block_peek_if_too_old(
+/*======================*/
+ /* out: TRUE if should be made younger */
+ buf_block_t* block) /* in: block to make younger */
+{
+ if (buf_pool->freed_page_clock >= block->freed_page_clock
+ + 1 + (buf_pool->curr_size / 1024)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
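+
+/* Worked example (illustrative numbers only, not from the original
+source): with buf_pool->curr_size == 8192 pages the threshold above is
+1 + 8192 / 1024 == 9, i.e. the block is recommended to be made younger
+once at least 9 blocks have been evicted from the end of the LRU list
+since freed_page_clock was last stored in the block. */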
+
+/*************************************************************************
+Gets the current size of the buf_pool in bytes. */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void)
+/*========================*/
+ /* out: size in bytes */
+{
+ return((buf_pool->curr_size) * UNIV_PAGE_SIZE);
+}
+
+/*************************************************************************
+Gets the maximum size of the buf_pool in bytes. */
+UNIV_INLINE
+ulint
+buf_pool_get_max_size(void)
+/*=======================*/
+ /* out: size in bytes */
+{
+ return((buf_pool->max_size) * UNIV_PAGE_SIZE);
+}
+
+/***********************************************************************
+Accessor function for block array. */
+UNIV_INLINE
+buf_block_t*
+buf_pool_get_nth_block(
+/*===================*/
+ /* out: pointer to block */
+ buf_pool_t* buf_pool,/* in: buf_pool */
+ ulint i) /* in: index of the block */
+{
+ ut_ad(buf_pool);
+ ut_ad(i < buf_pool->max_size);
+
+ return(i + buf_pool->blocks);
+}
+
+/***********************************************************************
+Checks if a pointer points to the block array of the buffer pool (blocks, not
+the frames). */
+UNIV_INLINE
+ibool
+buf_pool_is_block(
+/*==============*/
+ /* out: TRUE if pointer to block */
+ void* ptr) /* in: pointer to memory */
+{
+ if ((buf_pool->blocks <= (buf_block_t*)ptr)
+ && ((buf_block_t*)ptr < buf_pool->blocks + buf_pool->max_size)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/************************************************************************
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+ut_dulint_zero if all modified pages have been flushed to disk. */
+UNIV_INLINE
+dulint
+buf_pool_get_oldest_modification(void)
+/*==================================*/
+ /* out: oldest modification in pool,
+ ut_dulint_zero if none */
+{
+ buf_block_t* block;
+ dulint lsn;
+
+ mutex_enter(&(buf_pool->mutex));
+
+ block = UT_LIST_GET_LAST(buf_pool->flush_list);
+
+ if (block == NULL) {
+ lsn = ut_dulint_zero;
+ } else {
+ lsn = block->oldest_modification;
+ }
+
+ mutex_exit(&(buf_pool->mutex));
+
+ return(lsn);
+}
+
+/***********************************************************************
+Increments the buf_pool clock by one and returns its new value. Remember
+that in the 32 bit version the clock wraps around at 4 billion! */
+UNIV_INLINE
+ulint
+buf_pool_clock_tic(void)
+/*====================*/
+ /* out: new clock value */
+{
+ ut_ad(mutex_own(&(buf_pool->mutex)));
+
+ buf_pool->ulint_clock++;
+
+ return(buf_pool->ulint_clock);
+}
+
+/*************************************************************************
+Gets a pointer to the memory frame of a block. */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+ /* out: pointer to the frame */
+ buf_block_t* block) /* in: pointer to the control block */
+{
+ ut_ad(block);
+ ut_ad(block >= buf_pool->blocks);
+ ut_ad(block < buf_pool->blocks + buf_pool->max_size);
+ ut_ad(block->state != BUF_BLOCK_NOT_USED);
+ ut_ad((block->state != BUF_BLOCK_FILE_PAGE)
+ || (block->buf_fix_count > 0));
+
+ return(block->frame);
+}
+
+/*************************************************************************
+Gets the space id of a block. */
+UNIV_INLINE
+ulint
+buf_block_get_space(
+/*================*/
+ /* out: space id */
+ buf_block_t* block) /* in: pointer to the control block */
+{
+ ut_ad(block);
+ ut_ad(block >= buf_pool->blocks);
+ ut_ad(block < buf_pool->blocks + buf_pool->max_size);
+ ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->buf_fix_count > 0);
+
+ return(block->space);
+}
+
+/*************************************************************************
+Gets the page number of a block. */
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+ /* out: page number */
+ buf_block_t* block) /* in: pointer to the control block */
+{
+ ut_ad(block);
+ ut_ad(block >= buf_pool->blocks);
+ ut_ad(block < buf_pool->blocks + buf_pool->max_size);
+ ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->buf_fix_count > 0);
+
+ return(block->offset);
+}
+
+/***********************************************************************
+Gets the block whose frame the pointer is pointing to. */
+UNIV_INLINE
+buf_block_t*
+buf_block_align(
+/*============*/
+ /* out: pointer to block */
+ byte* ptr) /* in: pointer to a frame */
+{
+ buf_block_t* block;
+ buf_frame_t* frame_zero;
+
+ ut_ad(ptr);
+
+ frame_zero = buf_pool->frame_zero;
+
+ ut_ad((ulint)ptr >= (ulint)frame_zero);
+
+ block = buf_pool_get_nth_block(buf_pool, (ptr - frame_zero)
+ >> UNIV_PAGE_SIZE_SHIFT);
+ return(block);
+}
+
+/***********************************************************************
+Gets the block whose frame the pointer is pointing to. Does not
+require a file page to be bufferfixed. */
+UNIV_INLINE
+buf_block_t*
+buf_block_align_low(
+/*================*/
+ /* out: pointer to block */
+ byte* ptr) /* in: pointer to a frame */
+{
+ buf_block_t* block;
+ buf_frame_t* frame_zero;
+
+ ut_ad(ptr);
+
+ frame_zero = buf_pool->frame_zero;
+
+ ut_ad((ulint)ptr >= (ulint)frame_zero);
+
+ block = buf_pool_get_nth_block(buf_pool, (ptr - frame_zero)
+ >> UNIV_PAGE_SIZE_SHIFT);
+ return(block);
+}
+
+/***********************************************************************
+Gets the frame the pointer is pointing to. */
+UNIV_INLINE
+buf_frame_t*
+buf_frame_align(
+/*============*/
+			/* out: pointer to frame */
+ byte* ptr) /* in: pointer to a frame */
+{
+ buf_frame_t* frame;
+
+ ut_ad(ptr);
+
+ frame = ut_align_down(ptr, UNIV_PAGE_SIZE);
+
+ ut_ad((ulint)frame
+ >= (ulint)(buf_pool_get_nth_block(buf_pool, 0)->frame));
+ ut_ad((ulint)frame <= (ulint)(buf_pool_get_nth_block(buf_pool,
+ buf_pool->max_size - 1)->frame));
+ return(frame);
+}
+
+/**************************************************************************
+Gets the page number of a pointer pointing within a buffer frame containing
+a file page. */
+UNIV_INLINE
+ulint
+buf_frame_get_page_no(
+/*==================*/
+ /* out: page number */
+ byte* ptr) /* in: pointer to within a buffer frame */
+{
+ return(buf_block_get_page_no(buf_block_align(ptr)));
+}
+
+/**************************************************************************
+Gets the space id of a pointer pointing within a buffer frame containing a
+file page. */
+UNIV_INLINE
+ulint
+buf_frame_get_space_id(
+/*===================*/
+ /* out: space id */
+ byte* ptr) /* in: pointer to within a buffer frame */
+{
+ return(buf_block_get_space(buf_block_align(ptr)));
+}
+
+/**************************************************************************
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+ byte* ptr, /* in: pointer to a buffer frame */
+ ulint* space, /* out: space id */
+ fil_addr_t* addr) /* out: page offset and byte offset */
+{
+ buf_block_t* block;
+
+ block = buf_block_align(ptr);
+
+ *space = buf_block_get_space(block);
+ addr->page = buf_block_get_page_no(block);
+ addr->boffset = ptr - buf_frame_align(ptr);
+}
+
+/**************************************************************************
+Gets the hash value of the page the pointer is pointing to. This can be used
+in searches in the lock hash table. */
+UNIV_INLINE
+ulint
+buf_frame_get_lock_hash_val(
+/*========================*/
+ /* out: lock hash value */
+ byte* ptr) /* in: pointer to within a buffer frame */
+{
+ buf_block_t* block;
+
+ block = buf_block_align(ptr);
+
+ return(block->lock_hash_val);
+}
+
+/**************************************************************************
+Gets the mutex protecting the page record lock hash chain in the lock
+table. */
+UNIV_INLINE
+mutex_t*
+buf_frame_get_lock_mutex(
+/*=====================*/
+ /* out: mutex */
+ byte* ptr) /* in: pointer to within a buffer frame */
+{
+ buf_block_t* block;
+
+ block = buf_block_align(ptr);
+
+ return(block->lock_mutex);
+}
+
+/*************************************************************************
+Copies contents of a buffer frame to a given buffer. */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+ /* out: buf */
+ byte* buf, /* in: buffer to copy to */
+ buf_frame_t* frame) /* in: buffer frame */
+{
+ ut_ad(buf && frame);
+
+ ut_memcpy(buf, frame, UNIV_PAGE_SIZE);
+
+ return(buf);
+}
+
+/************************************************************************
+Calculates a folded value of a file page address to use in the page hash
+table. */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+ /* out: the folded value */
+ ulint space, /* in: space id */
+ ulint offset) /* in: offset of the page within space */
+{
+ return((space << 20) + space + offset);
+}
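+
+/* Worked example (illustrative numbers only, not from the original
+source): space == 1, offset == 5 folds to (1 << 20) + 1 + 5 == 1048582. */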
+
+/************************************************************************
+This function is used to check whether an i/o operation is in
+progress on a buffer page. */
+UNIV_INLINE
+ibool
+buf_page_io_query(
+/*==============*/
+ /* out: TRUE if io going on */
+ buf_block_t* block) /* in: buf_pool block, must be bufferfixed */
+{
+ mutex_enter(&(buf_pool->mutex));
+
+ ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->buf_fix_count > 0);
+
+ if (block->io_fix != 0) {
+ mutex_exit(&(buf_pool->mutex));
+
+ return(TRUE);
+ }
+
+ mutex_exit(&(buf_pool->mutex));
+
+ return(FALSE);
+}
+
+/************************************************************************
+Gets the youngest modification log sequence number for a frame. Returns zero
+if not a file page or no modification occurred yet. */
+UNIV_INLINE
+dulint
+buf_frame_get_newest_modification(
+/*==============================*/
+ /* out: newest modification to the page */
+ buf_frame_t* frame) /* in: pointer to a frame */
+{
+ buf_block_t* block;
+ dulint lsn;
+
+ ut_ad(frame);
+
+ block = buf_block_align(frame);
+
+ mutex_enter(&(buf_pool->mutex));
+
+ if (block->state == BUF_BLOCK_FILE_PAGE) {
+ lsn = block->newest_modification;
+ } else {
+ lsn = ut_dulint_zero;
+ }
+
+ mutex_exit(&(buf_pool->mutex));
+
+ return(lsn);
+}
+
+/************************************************************************
+Increments the modify clock of a frame by 1. The caller must either (1) own
+the buf_pool mutex while the block bufferfix count is zero, or (2) own an
+x-lock on the block. */
+UNIV_INLINE
+dulint
+buf_frame_modify_clock_inc(
+/*=======================*/
+ /* out: new value */
+ buf_frame_t* frame) /* in: pointer to a frame */
+{
+ buf_block_t* block;
+
+ ut_ad(frame);
+
+ block = buf_block_align_low(frame);
+
+ ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0))
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+
+ UT_DULINT_INC(block->modify_clock);
+
+ return(block->modify_clock);
+}
+
+/************************************************************************
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block. */
+UNIV_INLINE
+dulint
+buf_frame_get_modify_clock(
+/*=======================*/
+ /* out: value */
+ buf_frame_t* frame) /* in: pointer to a frame */
+{
+ buf_block_t* block;
+
+ ut_ad(frame);
+
+ block = buf_block_align(frame);
+
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+
+ return(block->modify_clock);
+}
+
+/***********************************************************************
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc_debug(
+/*========================*/
+ buf_block_t* block, /* in: block to bufferfix */
+ char* file, /* in: file name */
+ ulint line) /* in: line */
+{
+ ibool ret;
+
+ ret = rw_lock_s_lock_func_nowait(&(block->debug_latch)
+#ifdef UNIV_SYNC_DEBUG
+ ,file, line
+#endif
+ );
+
+ ut_ad(ret == TRUE);
+
+ block->buf_fix_count++;
+}
+
+/***********************************************************************
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc(
+/*==================*/
+ buf_block_t* block) /* in: block to bufferfix */
+{
+ block->buf_fix_count++;
+}
+
+/**********************************************************************
+Returns the control block of a file page, NULL if not found. */
+UNIV_INLINE
+buf_block_t*
+buf_page_hash_get(
+/*==============*/
+ /* out: block, NULL if not found */
+ ulint space, /* in: space id */
+ ulint offset) /* in: offset of the page within space */
+{
+ buf_block_t* block;
+ ulint fold;
+
+ ut_ad(buf_pool);
+ ut_ad(mutex_own(&(buf_pool->mutex)));
+
+ /* Look for the page in the hash table */
+
+ fold = buf_page_address_fold(space, offset);
+
+ HASH_SEARCH(hash, buf_pool->page_hash, fold, block,
+ (block->space == space) && (block->offset == offset));
+ return(block);
+}
+
+/************************************************************************
+Tries to get the page, but if file io is required, releases all latches
+in mtr down to the given savepoint. If io is required, this function
+retrieves the page into the buf_pool, but does not bufferfix it or latch
+it. */
+UNIV_INLINE
+buf_frame_t*
+buf_page_get_release_on_io(
+/*=======================*/
+ /* out: pointer to the frame, or NULL
+					if not in the buf_pool */
+ ulint space, /* in: space id */
+ ulint offset, /* in: offset of the page within space
+ in units of a page */
+ buf_frame_t* guess, /* in: guessed frame or NULL */
+ ulint rw_latch, /* in: RW_X_LATCH, RW_S_LATCH,
+ or RW_NO_LATCH */
+ ulint savepoint, /* in: mtr savepoint */
+ mtr_t* mtr) /* in: mtr */
+{
+ buf_frame_t* frame;
+
+ frame = buf_page_get_gen(space, offset, rw_latch, guess,
+ BUF_GET_IF_IN_POOL,
+#ifdef UNIV_SYNC_DEBUG
+ __FILE__, __LINE__,
+#endif
+ mtr);
+ if (frame != NULL) {
+
+ return(frame);
+ }
+
+	/* The page was not in the buf_pool: release the latches
+ down to the savepoint */
+
+ mtr_rollback_to_savepoint(mtr, savepoint);
+
+ buf_page_get(space, offset, RW_S_LATCH, mtr);
+
+ /* When we get here, the page is in buffer, but we release
+ the latches again down to the savepoint, before returning */
+
+ mtr_rollback_to_savepoint(mtr, savepoint);
+
+ return(NULL);
+}
+
+/************************************************************************
+Decrements the bufferfix count of a buffer control block and releases
+a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+ buf_block_t* block, /* in: buffer block */
+ ulint rw_latch, /* in: RW_S_LATCH, RW_X_LATCH,
+ RW_NO_LATCH */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint buf_fix_count;
+
+ ut_ad(block);
+
+ mutex_enter_fast(&(buf_pool->mutex));
+
+ ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->buf_fix_count > 0);
+
+ if (rw_latch == RW_X_LATCH && mtr->modifications) {
+
+ buf_flush_note_modification(block, mtr);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_s_unlock(&(block->debug_latch));
+#endif
+ buf_fix_count = block->buf_fix_count;
+ block->buf_fix_count = buf_fix_count - 1;
+
+ mutex_exit(&(buf_pool->mutex));
+
+ if (rw_latch == RW_S_LATCH) {
+ rw_lock_s_unlock(&(block->lock));
+ } else if (rw_latch == RW_X_LATCH) {
+ rw_lock_x_unlock(&(block->lock));
+ }
+}
+
+/*************************************************************************
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. If
+UNIV_SYNC_DEBUG is not defined, compiles to an empty function. */
+UNIV_INLINE
+void
+buf_page_dbg_add_level(
+/*===================*/
+ buf_frame_t* frame, /* in: buffer page where we have acquired
+ a latch */
+ ulint level) /* in: latching order level */
+{
+#ifdef UNIV_SYNC_DEBUG
+ sync_thread_add_level(&(buf_block_align(frame)->lock), level);
+#endif
+}
diff --git a/innobase/include/buf0flu.h b/innobase/include/buf0flu.h
new file mode 100644
index 00000000000..9317950904f
--- /dev/null
+++ b/innobase/include/buf0flu.h
@@ -0,0 +1,110 @@
+/******************************************************
+The database buffer pool flush algorithm
+
+(c) 1995 Innobase Oy
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0flu_h
+#define buf0flu_h
+
+#include "univ.i"
+#include "buf0types.h"
+#include "ut0byte.h"
+#include "mtr0types.h"
+
+/************************************************************************
+Updates the flush system data structures when a write is completed. */
+
+void
+buf_flush_write_complete(
+/*=====================*/
+ buf_block_t* block); /* in: pointer to the block in question */
+/*************************************************************************
+Flushes pages from the end of the LRU list if there is too small
+a margin of replaceable pages there. */
+
+void
+buf_flush_free_margin(void);
+/*=======================*/
+/***********************************************************************
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages! */
+
+ulint
+buf_flush_batch(
+/*============*/
+ /* out: number of blocks for which the write
+ request was queued */
+ ulint flush_type, /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
+ BUF_FLUSH_LIST, then the caller must not own
+ any latches on pages */
+	ulint	min_n,		/* in: wished minimum number of blocks flushed
+ (it is not guaranteed that the actual number
+ is that big, though) */
+	dulint	lsn_limit);	/* in: in the case BUF_FLUSH_LIST all blocks whose
+ oldest_modification is smaller than this
+ should be flushed (if their number does not
+ exceed min_n), otherwise ignored */
+/**********************************************************************
+Waits until a flush batch of the given type ends. */
+
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+ ulint type); /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+/************************************************************************
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block to the list of modified blocks, if it not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+ buf_block_t* block, /* in: block which is modified */
+ mtr_t* mtr); /* in: mtr */
+/************************************************************************
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+ buf_block_t* block, /* in: block which is modified */
+ dulint start_lsn, /* in: start lsn of the first mtr in a
+ set of mtr's */
+ dulint end_lsn); /* in: end lsn of the last mtr in the
+ set of mtr's */
+/************************************************************************
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., transition FILE_PAGE => NOT_USED allowed. */
+
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+ /* out: TRUE if can replace immediately */
+ buf_block_t* block); /* in: buffer control block, must be in state
+ BUF_BLOCK_FILE_PAGE and in the LRU list */
+/**********************************************************************
+Validates the flush list. */
+
+ibool
+buf_flush_validate(void);
+/*====================*/
+ /* out: TRUE if ok */
+
+/* When buf_flush_free_margin is called, it tries to make this many blocks
+available to replacement in the free list and at the end of the LRU list (to
+make sure that a read-ahead batch can be read efficiently in a single
+sweep). */
+
+#define BUF_FLUSH_FREE_BLOCK_MARGIN (5 + BUF_READ_AHEAD_AREA)
+#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4)
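+
+/* Worked example (illustrative numbers only, not from the original
+source): with buf_pool->curr_size >= 512 pages, BUF_READ_AHEAD_AREA is 32,
+so BUF_FLUSH_FREE_BLOCK_MARGIN is 5 + 32 == 37 and BUF_FLUSH_EXTRA_MARGIN
+is 37 / 4 == 9 (integer division). */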
+
+#ifndef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#endif
diff --git a/innobase/include/buf0flu.ic b/innobase/include/buf0flu.ic
new file mode 100644
index 00000000000..e2faf773cab
--- /dev/null
+++ b/innobase/include/buf0flu.ic
@@ -0,0 +1,100 @@
+/******************************************************
+The database buffer pool flush algorithm
+
+(c) 1995 Innobase Oy
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0buf.h"
+#include "mtr0mtr.h"
+
+/************************************************************************
+Inserts a modified block into the flush list. */
+
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+ buf_block_t* block); /* in: block which is modified */
+/************************************************************************
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+ buf_block_t* block); /* in: block which is modified */
+
+/************************************************************************
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block to the list of modified blocks, if it is not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+ buf_block_t* block, /* in: block which is modified */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(block);
+ ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->buf_fix_count > 0);
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+ ut_ad(mutex_own(&(buf_pool->mutex)));
+
+ ut_ad(ut_dulint_cmp(mtr->start_lsn, ut_dulint_zero) != 0);
+ ut_ad(mtr->modifications);
+ ut_ad(ut_dulint_cmp(block->newest_modification, mtr->end_lsn) <= 0);
+
+ block->newest_modification = mtr->end_lsn;
+
+ if (ut_dulint_is_zero(block->oldest_modification)) {
+
+ block->oldest_modification = mtr->start_lsn;
+ ut_ad(!ut_dulint_is_zero(block->oldest_modification));
+
+ buf_flush_insert_into_flush_list(block);
+ } else {
+ ut_ad(ut_dulint_cmp(block->oldest_modification,
+ mtr->start_lsn) <= 0);
+ }
+}
+
+/************************************************************************
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+ buf_block_t* block, /* in: block which is modified */
+ dulint start_lsn, /* in: start lsn of the first mtr in a
+ set of mtr's */
+ dulint end_lsn) /* in: end lsn of the last mtr in the
+ set of mtr's */
+{
+ ut_ad(block);
+ ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->buf_fix_count > 0);
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+
+ mutex_enter(&(buf_pool->mutex));
+
+ ut_ad(ut_dulint_cmp(block->newest_modification, end_lsn) <= 0);
+
+ block->newest_modification = end_lsn;
+
+ if (ut_dulint_is_zero(block->oldest_modification)) {
+
+ block->oldest_modification = start_lsn;
+
+ ut_ad(!ut_dulint_is_zero(block->oldest_modification));
+
+ buf_flush_insert_sorted_into_flush_list(block);
+ } else {
+ ut_ad(ut_dulint_cmp(block->oldest_modification,
+ start_lsn) <= 0);
+ }
+
+ mutex_exit(&(buf_pool->mutex));
+}
diff --git a/innobase/include/buf0lru.h b/innobase/include/buf0lru.h
new file mode 100644
index 00000000000..946b6c4e31d
--- /dev/null
+++ b/innobase/include/buf0lru.h
@@ -0,0 +1,117 @@
+/******************************************************
+The database buffer pool LRU replacement algorithm
+
+(c) 1995 Innobase Oy
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0lru_h
+#define buf0lru_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "buf0types.h"
+
+/**********************************************************************
+Tries to remove LRU flushed blocks from the end of the LRU list and put them
+to the free list. This is beneficial for the efficiency of the insert buffer
+operation, as flushed pages from non-unique non-clustered indexes are here
+taken out of the buffer pool, and their inserts redirected to the insert
+buffer. Otherwise, the flushed blocks could get modified again before read
+operations need new buffer blocks, and the i/o work done in flushing would be
+wasted. */
+
+void
+buf_LRU_try_free_flushed_blocks(void);
+/*==================================*/
+
+/*#######################################################################
+These are low-level functions
+#########################################################################*/
+
+/* Minimum LRU list length for which the LRU_old pointer is defined */
+
+#define BUF_LRU_OLD_MIN_LEN 80
+
+#define BUF_LRU_FREE_SEARCH_LEN (5 + 2 * BUF_READ_AHEAD_AREA)
+
+/**********************************************************************
+Gets the minimum LRU_position field for the blocks in an initial segment
+(determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not
+guaranteed to be precise, because the ulint_clock may wrap around. */
+
+ulint
+buf_LRU_get_recent_limit(void);
+/*==========================*/
+ /* out: the limit; zero if could not determine it */
+/**********************************************************************
+Returns a free block from the buf_pool. The block is taken off the
+free list. If the free list is empty, blocks are moved from the end of the
+LRU list to the free list. */
+
+buf_block_t*
+buf_LRU_get_free_block(void);
+/*=========================*/
+ /* out: the free control block */
+/**********************************************************************
+Puts a block back to the free list. */
+
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+ buf_block_t* block); /* in: block, must not contain a file page */
+/**********************************************************************
+Adds a block to the LRU list. */
+
+void
+buf_LRU_add_block(
+/*==============*/
+ buf_block_t* block, /* in: control block */
+ ibool old); /* in: TRUE if should be put to the old
+ blocks in the LRU list, else put to the
+ start; if the LRU list is very short, added to
+ the start regardless of this parameter */
+/**********************************************************************
+Moves a block to the start of the LRU list. */
+
+void
+buf_LRU_make_block_young(
+/*=====================*/
+ buf_block_t* block); /* in: control block */
+/**********************************************************************
+Moves a block to the end of the LRU list. */
+
+void
+buf_LRU_make_block_old(
+/*===================*/
+ buf_block_t* block); /* in: control block */
+/**********************************************************************
+Looks for a replaceable block from the end of the LRU list and puts it to
+the free list if found. */
+
+ibool
+buf_LRU_search_and_free_block(
+/*==========================*/
+ /* out: TRUE if freed */
+ ulint n_iterations); /* in: how many times this has been called
+ repeatedly without result: a high value
+ means that we should search farther */
+/**************************************************************************
+Validates the LRU list. */
+
+ibool
+buf_LRU_validate(void);
+/*==================*/
+/**************************************************************************
+Prints the LRU list. */
+
+void
+buf_LRU_print(void);
+/*===============*/
+
+#ifndef UNIV_NONINL
+#include "buf0lru.ic"
+#endif
+
+#endif
diff --git a/innobase/include/buf0lru.ic b/innobase/include/buf0lru.ic
new file mode 100644
index 00000000000..7b8ee457b0b
--- /dev/null
+++ b/innobase/include/buf0lru.ic
@@ -0,0 +1,8 @@
+/******************************************************
+The database buffer replacement algorithm
+
+(c) 1995 Innobase Oy
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
diff --git a/innobase/include/buf0rea.h b/innobase/include/buf0rea.h
new file mode 100644
index 00000000000..1efe67369ab
--- /dev/null
+++ b/innobase/include/buf0rea.h
@@ -0,0 +1,98 @@
+/******************************************************
+The database buffer read
+
+(c) 1995 Innobase Oy
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0rea_h
+#define buf0rea_h
+
+#include "univ.i"
+#include "buf0types.h"
+
+/************************************************************************
+High-level function which reads a page asynchronously from a file to the
+buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread. Does a random read-ahead if it seems
+sensible. */
+
+ulint
+buf_read_page(
+/*==========*/
+ /* out: number of page read requests issued: this can
+ be > 1 if read-ahead occurred */
+ ulint space, /* in: space id */
+ ulint offset);/* in: page number */
+/************************************************************************
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens if these are not initialized to any
+sensible value? No problem: before applying read-ahead we check that the
+area to read is within the span of the space; if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io. */
+
+ulint
+buf_read_ahead_linear(
+/*==================*/
+ /* out: number of page read requests issued */
+ ulint space, /* in: space id */
+ ulint offset);/* in: page number of a page; NOTE: the current thread
+ must want access to this page (see NOTE 3 above) */
+/************************************************************************
+Issues read requests for pages which the ibuf module wants to read in, in
+order to contract insert buffer trees. Technically, this function is like
+a read-ahead function. */
+
+void
+buf_read_ibuf_merge_pages(
+/*======================*/
+ ibool sync, /* in: TRUE if the caller wants this function
+ to wait for the highest address page to get
+ read in, before this function returns */
+ ulint space, /* in: space id */
+ ulint* page_nos, /* in: array of page numbers to read, with
+ the highest page number last in the array */
+ ulint n_stored); /* in: number of page numbers in the array */
+/************************************************************************
+Issues read requests for pages which recovery wants to read in. */
+
+void
+buf_read_recv_pages(
+/*================*/
+ ibool sync, /* in: TRUE if the caller wants this function
+ to wait for the highest address page to get
+ read in, before this function returns */
+ ulint space, /* in: space id */
+ ulint* page_nos, /* in: array of page numbers to read, with the
+ highest page number the last in the array */
+ ulint n_stored); /* in: number of page numbers in the array */
+
+/* The size in pages of the area which the read-ahead algorithms read if
+invoked */
+
+#define BUF_READ_AHEAD_AREA ut_min(32, buf_pool->curr_size / 16)
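+
+/* Illustrative sketch (editorial, not part of the original header): one
+plausible way to delimit the read-ahead area containing a given page; the
+variable names are hypothetical and buf_read_ahead_linear above is not
+required to use exactly this formulation:
+
+	ulint	area = BUF_READ_AHEAD_AREA;
+	ulint	low  = (offset / area) * area;
+	ulint	high = low + area;
+
+A page would then be a border page of its area if offset == low or
+offset == high - 1. */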
+
+/* Modes used in read-ahead */
+#define BUF_READ_IBUF_PAGES_ONLY 131
+#define BUF_READ_ANY_PAGE 132
+
+#endif
diff --git a/innobase/include/buf0types.h b/innobase/include/buf0types.h
new file mode 100644
index 00000000000..44fdfa80e73
--- /dev/null
+++ b/innobase/include/buf0types.h
@@ -0,0 +1,20 @@
+/******************************************************
+The database buffer pool global types for the directory
+
+(c) 1995 Innobase Oy
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0types_h
+#define buf0types_h
+
+typedef struct buf_block_struct buf_block_t;
+typedef struct buf_pool_struct buf_pool_t;
+
+/* The 'type' used for a buffer frame */
+typedef byte buf_frame_t;
+
+
+#endif
+
diff --git a/innobase/include/com0com.h b/innobase/include/com0com.h
new file mode 100644
index 00000000000..6f04b6a3f11
--- /dev/null
+++ b/innobase/include/com0com.h
@@ -0,0 +1,125 @@
+/******************************************************
+The communication primitives
+
+(c) 1995 Innobase Oy
+
+Created 9/23/1995 Heikki Tuuri
+*******************************************************/
+
+/* This module defines a standard datagram communication
+function interface for use in the database. We assume that
+the communication medium is reliable. */
+
+#ifndef com0com_h
+#define com0com_h
+
+#include "univ.i"
+
+/* The communications endpoint type definition */
+typedef struct com_endpoint_struct com_endpoint_t;
+
+/* Possible endpoint communication types */
+#define COM_SHM 1 /* communication through shared memory */
+
+/* Option numbers for endpoint */
+#define COM_OPT_MAX_DGRAM_SIZE 1
+
+/* Error numbers */
+#define COM_ERR_NOT_SPECIFIED 1
+#define COM_ERR_NOT_BOUND 2
+#define COM_ERR_ALREADY_BOUND 3
+#define COM_ERR_MAX_DATAGRAM_SIZE_NOT_SET 4
+#define COM_ERR_DATA_BUFFER_TOO_SMALL 5
+#define COM_ERR_ADDR_BUFFER_TOO_SMALL 6
+#define COM_ERR_DATA_TOO_LONG 7
+#define COM_ERR_ADDR_TOO_LONG 8
+#define COM_ERR_DGRAM_NOT_DELIVERED 9
+
+/* Maximum allowed address length in bytes */
+#define COM_MAX_ADDR_LEN 100
+
+/*************************************************************************
+Creates a communications endpoint. */
+
+com_endpoint_t*
+com_endpoint_create(
+/*================*/
+ /* out, own: communications endpoint, NULL if
+ did not succeed */
+ ulint type); /* in: communication type of endpoint:
+ only COM_SHM supported */
+/*************************************************************************
+Frees a communications endpoint. */
+
+ulint
+com_endpoint_free(
+/*==============*/
+				/* out: 0 if succeed, else error number */
+ com_endpoint_t* ep); /* in, own: communications endpoint */
+/*************************************************************************
+Sets an option, like the maximum datagram size for an endpoint.
+The options may vary depending on the endpoint type. */
+
+ulint
+com_endpoint_set_option(
+/*====================*/
+ /* out: 0 if succeed, else error number */
+ com_endpoint_t* ep, /* in: endpoint */
+ ulint optno, /* in: option number, only
+ COM_OPT_MAX_DGRAM_SIZE currently supported */
+ byte* optval, /* in: pointer to a buffer containing the
+ option value to set */
+ ulint optlen);/* in: option value buffer length */
+/*************************************************************************
+Binds a communications endpoint to a specified address. */
+
+ulint
+com_bind(
+/*=====*/
+ /* out: 0 if succeed, else error number */
+ com_endpoint_t* ep, /* in: communications endpoint */
+ char* name, /* in: address name */
+ ulint len); /* in: name length */
+/*************************************************************************
+Waits for a datagram to arrive at an endpoint. */
+
+ulint
+com_recvfrom(
+/*=========*/
+ /* out: 0 if succeed, else error number */
+ com_endpoint_t* ep, /* in: communications endpoint */
+ byte* buf, /* out: datagram buffer; the buffer must be
+ supplied by the caller */
+ ulint buf_len,/* in: datagram buffer length */
+ ulint* len, /* out: datagram length */
+ char* from, /* out: address name buffer; the buffer must be
+ supplied by the caller */
+ ulint from_len,/* in: address name buffer length */
+ ulint* addr_len);/* out: address name length */
+/*************************************************************************
+Sends a datagram to a specified destination. */
+
+ulint
+com_sendto(
+/*=======*/
+ /* out: 0 if succeed, else error number */
+ com_endpoint_t* ep, /* in: communications endpoint */
+ byte* buf, /* in: datagram buffer */
+ ulint len, /* in: datagram length */
+ char* to, /* in: address name buffer */
+ ulint tolen); /* in: address name length */
+/*************************************************************************
+Gets the maximum datagram size for an endpoint. */
+
+ulint
+com_endpoint_get_max_size(
+/*======================*/
+ /* out: maximum size */
+ com_endpoint_t* ep); /* in: endpoint */
+
+
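+/* Illustrative usage sketch (editorial, not part of the original header).
+Error handling is omitted; the address name "srv_addr", the datagram size
+of 512 bytes, and the binary ulint encoding of the option value are
+assumptions made for the example only:
+
+	com_endpoint_t*	ep;
+	byte		dgram[512];
+	char		from[COM_MAX_ADDR_LEN];
+	ulint		size = 512;
+	ulint		len;
+	ulint		addr_len;
+
+	ep = com_endpoint_create(COM_SHM);
+	com_endpoint_set_option(ep, COM_OPT_MAX_DGRAM_SIZE,
+					(byte*)&size, sizeof(ulint));
+	com_bind(ep, "srv_addr", 8);
+	com_recvfrom(ep, dgram, sizeof(dgram), &len,
+					from, sizeof(from), &addr_len);
+	com_sendto(ep, dgram, len, from, addr_len);
+	com_endpoint_free(ep);
+*/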
+#ifndef UNIV_NONINL
+#include "com0com.ic"
+#endif
+
+#endif
diff --git a/innobase/include/com0com.ic b/innobase/include/com0com.ic
new file mode 100644
index 00000000000..cec1cb190cc
--- /dev/null
+++ b/innobase/include/com0com.ic
@@ -0,0 +1,7 @@
+/******************************************************
+The communication primitives
+
+(c) 1995 Innobase Oy
+
+Created 9/23/1995 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/com0shm.h b/innobase/include/com0shm.h
new file mode 100644
index 00000000000..7de9c4ac2de
--- /dev/null
+++ b/innobase/include/com0shm.h
@@ -0,0 +1,103 @@
+/******************************************************
+The communication through shared memory
+
+(c) 1995 Innobase Oy
+
+Created 9/23/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef com0shm_h
+#define com0shm_h
+
+#include "univ.i"
+
+typedef struct com_shm_endpoint_struct com_shm_endpoint_t;
+
+/* The performance of communication in NT depends on how
+many times a system call is made (excluding os_thread_yield,
+as that is the fastest way to switch threads).
+The following variable counts such events. */
+
+extern ulint com_shm_system_call_count;
+
+
+/*************************************************************************
+Creates a communications endpoint. */
+
+com_shm_endpoint_t*
+com_shm_endpoint_create(void);
+/*=========================*/
+ /* out, own: communications endpoint, NULL if
+ did not succeed */
+/*************************************************************************
+Frees a communications endpoint. */
+
+ulint
+com_shm_endpoint_free(
+/*==================*/
+				/* out: 0 if succeed, else error number */
+ com_shm_endpoint_t* ep);/* in, own: communications endpoint */
+/*************************************************************************
+Sets an option, like the maximum datagram size for an endpoint.
+The options may vary depending on the endpoint type. */
+
+ulint
+com_shm_endpoint_set_option(
+/*========================*/
+ /* out: 0 if succeed, else error number */
+ com_shm_endpoint_t* ep, /* in: endpoint */
+ ulint optno, /* in: option number, only
+ COM_OPT_MAX_DGRAM_SIZE currently supported */
+ byte* optval, /* in: pointer to a buffer containing the
+ option value to set */
+ ulint optlen);/* in: option value buffer length */
+/*************************************************************************
+Binds a communications endpoint to a specified address. */
+
+ulint
+com_shm_bind(
+/*=========*/
+ /* out: 0 if succeed, else error number */
+ com_shm_endpoint_t* ep, /* in: communications endpoint */
+ char* name, /* in: address name */
+ ulint len); /* in: address name length */
+/*************************************************************************
+Waits for a datagram to arrive at an endpoint. */
+
+ulint
+com_shm_recvfrom(
+/*=============*/
+ /* out: 0 if succeed, else error number */
+ com_shm_endpoint_t* ep, /* in: communications endpoint */
+ byte* buf, /* out: datagram buffer; the buffer is
+ supplied by the caller */
+ ulint buf_len,/* in: datagram buffer length */
+ ulint* len, /* out: datagram length */
+ char* from, /* out: address name buffer; the buffer is
+ supplied by the caller */
+ ulint from_len,/* in: address name buffer length */
+ ulint* addr_len);/* out: address name length */
+/*************************************************************************
+Sends a datagram to the specified destination. */
+
+ulint
+com_shm_sendto(
+/*===========*/
+ /* out: 0 if succeed, else error number */
+ com_shm_endpoint_t* ep, /* in: communications endpoint */
+ byte* buf, /* in: datagram buffer */
+ ulint len, /* in: datagram length */
+ char* to, /* in: address name buffer */
+ ulint tolen); /* in: address name length */
+
+ulint
+com_shm_endpoint_get_size(
+/*======================*/
+				/* out: maximum size */
+	com_shm_endpoint_t*	ep);	/* in: endpoint */
+
+
+#ifndef UNIV_NONINL
+#include "com0shm.ic"
+#endif
+
+#endif
diff --git a/innobase/include/com0shm.ic b/innobase/include/com0shm.ic
new file mode 100644
index 00000000000..e0d3cb26f69
--- /dev/null
+++ b/innobase/include/com0shm.ic
@@ -0,0 +1,7 @@
+/******************************************************
+Communication through shared memory
+
+(c) 1995 Innobase Oy
+
+Created 9/23/1995 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/data0data.h b/innobase/include/data0data.h
new file mode 100644
index 00000000000..d7f0986b0b6
--- /dev/null
+++ b/innobase/include/data0data.h
@@ -0,0 +1,430 @@
+/************************************************************************
+SQL data field and tuple
+
+(c) 1994-1996 Innobase Oy
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0data_h
+#define data0data_h
+
+#include "univ.i"
+
+#include "data0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+
+/* Some non-inlined functions used in the MySQL interface: */
+void
+dfield_set_data_noninline(
+ dfield_t* field, /* in: field */
+ void* data, /* in: data */
+ ulint len); /* in: length or UNIV_SQL_NULL */
+void*
+dfield_get_data_noninline(
+ dfield_t* field); /* in: field */
+ulint
+dfield_get_len_noninline(
+ dfield_t* field); /* in: field */
+ulint
+dtuple_get_n_fields_noninline(
+ dtuple_t* tuple); /* in: tuple */
+dfield_t*
+dtuple_get_nth_field_noninline(
+ dtuple_t* tuple, /* in: tuple */
+ ulint n); /* in: index of field */
+
+/*************************************************************************
+Gets pointer to the type struct of SQL data field. */
+UNIV_INLINE
+dtype_t*
+dfield_get_type(
+/*============*/
+ /* out: pointer to the type struct */
+ dfield_t* field); /* in: SQL data field */
+/*************************************************************************
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /* in: SQL data field */
+ dtype_t* type); /* in: pointer to data type struct */
+/*************************************************************************
+Gets pointer to the data in a field. */
+UNIV_INLINE
+void*
+dfield_get_data(
+/*============*/
+ /* out: pointer to data */
+ dfield_t* field); /* in: field */
+/*************************************************************************
+Gets length of field data. */
+UNIV_INLINE
+ulint
+dfield_get_len(
+/*===========*/
+ /* out: length of data; UNIV_SQL_NULL if
+ SQL null data */
+ dfield_t* field); /* in: field */
+/*************************************************************************
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /* in: field */
+ ulint len); /* in: length or UNIV_SQL_NULL */
+/*************************************************************************
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /* in: field */
+ void* data, /* in: data */
+ ulint len); /* in: length or UNIV_SQL_NULL */
+/**************************************************************************
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /* in: pointer to a buffer of size len */
+ ulint len); /* in: SQL null size in bytes */
+/*************************************************************************
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /* in: field to copy to */
+ dfield_t* field2);/* in: field to copy from */
+/*************************************************************************
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /* in: field to copy to */
+ dfield_t* field2);/* in: field to copy from */
+/*************************************************************************
+Tests if data length and content is equal for two dfields. */
+UNIV_INLINE
+ibool
+dfield_datas_are_equal(
+/*===================*/
+ /* out: TRUE if equal */
+ dfield_t* field1, /* in: field */
+ dfield_t* field2);/* in: field */
+/*************************************************************************
+Tests if dfield data length and content is equal to the given. */
+UNIV_INLINE
+ibool
+dfield_data_is_equal(
+/*=================*/
+ /* out: TRUE if equal */
+ dfield_t* field, /* in: field */
+ ulint len, /* in: data length or UNIV_SQL_NULL */
+ byte* data); /* in: data */
+/*************************************************************************
+Gets number of fields in a data tuple. */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+ /* out: number of fields */
+ dtuple_t* tuple); /* in: tuple */
+/*************************************************************************
+Gets nth field of a tuple. */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+ /* out: nth field */
+ dtuple_t* tuple, /* in: tuple */
+ ulint n); /* in: index of field */
+/*************************************************************************
+Gets info bits in a data tuple. */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ /* out: info bits */
+ dtuple_t* tuple); /* in: tuple */
+/*************************************************************************
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /* in: tuple */
+ ulint info_bits); /* in: info bits */
+/*************************************************************************
+Gets number of fields used in record comparisons. */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ /* out: number of fields used in comparisons
+ in rem0cmp.* */
+ dtuple_t* tuple); /* in: tuple */
+/*************************************************************************
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /* in: tuple */
+ ulint n_fields_cmp); /* in: number of fields used in
+ comparisons in rem0cmp.* */
+/**************************************************************
+Creates a data tuple in a memory heap. The default value for the number
+of fields used in record comparisons for this tuple is n_fields. */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ /* out, own: created tuple */
+ mem_heap_t* heap, /* in: memory heap where the tuple
+ is created */
+ ulint n_fields); /* in: number of fields */
+
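+/* The following is an illustrative sketch only, not part of the original
+interface: it builds a two-field tuple in a memory heap with the functions
+declared above. mem_heap_create() and mem_heap_free() are assumed to be
+provided by mem0mem.h, which this header includes; the field contents are
+arbitrary example values. */
+#if 0
+static
+void
+dtuple_build_example(void)
+/*======================*/
+{
+	mem_heap_t*	heap;
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	ulint		id	= 42;
+
+	heap	= mem_heap_create(128);		/* initial heap size in bytes */
+	tuple	= dtuple_create(heap, 2);	/* 2 fields; n_fields_cmp == 2 */
+
+	field = dtuple_get_nth_field(tuple, 0);
+	dfield_set_data(field, (void*)"abc", 3);	/* 3 data bytes */
+
+	field = dtuple_get_nth_field(tuple, 1);
+	dfield_set_data(field, &id, sizeof(ulint));
+
+	ut_a(dtuple_get_n_fields(tuple) == 2);
+	ut_a(dfield_get_len(dtuple_get_nth_field(tuple, 0)) == 3);
+
+	mem_heap_free(heap);
+}
+#endif
+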
+/*************************************************************************
+Creates a dtuple for use in MySQL. */
+
+dtuple_t*
+dtuple_create_for_mysql(
+/*====================*/
+				/* out, own: created dtuple */
+ void** heap, /* out: created memory heap */
+ ulint n_fields); /* in: number of fields */
+/*************************************************************************
+Frees a dtuple used in MySQL. */
+
+void
+dtuple_free_for_mysql(
+/*==================*/
+	void*	heap);	/* in, own: memory heap where the dtuple
+			was created */
+/*************************************************************************
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you want later to set it smaller, you can use this. */
+
+void
+dtuple_set_n_fields(
+/*================*/
+ dtuple_t* tuple, /* in: tuple */
+ ulint n_fields); /* in: number of fields */
+/**************************************************************
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted. */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ /* out: sum of data lens */
+ dtuple_t* tuple); /* in: typed data tuple */
+/****************************************************************
+Returns TRUE if lengths of two dtuples are equal and respective data fields
+in them are equal. */
+UNIV_INLINE
+ibool
+dtuple_datas_are_equal(
+/*===================*/
+ /* out: TRUE if length and datas are equal */
+ dtuple_t* tuple1, /* in: tuple 1 */
+ dtuple_t* tuple2); /* in: tuple 2 */
+/****************************************************************
+Folds a prefix given as the number of fields of a tuple. */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+ /* out: the folded value */
+ dtuple_t* tuple, /* in: the tuple */
+ ulint n_fields,/* in: number of complete fields to fold */
+ ulint n_bytes,/* in: number of bytes to fold in an
+ incomplete last field */
+ dulint tree_id);/* in: index tree id */
+/***********************************************************************
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /* in: data tuple */
+ ulint n); /* in: number of fields to set */
+/**************************************************************
+Checks that a data field is typed. Asserts an error if not. */
+
+ibool
+dfield_check_typed(
+/*===============*/
+ /* out: TRUE if ok */
+ dfield_t* field); /* in: data field */
+/**************************************************************
+Checks that a data tuple is typed. Asserts an error if not. */
+
+ibool
+dtuple_check_typed(
+/*===============*/
+ /* out: TRUE if ok */
+ dtuple_t* tuple); /* in: tuple */
+/**************************************************************
+Validates the consistency of a tuple which must be complete, i.e.,
+all fields must have been set. */
+
+ibool
+dtuple_validate(
+/*============*/
+ /* out: TRUE if ok */
+ dtuple_t* tuple); /* in: tuple */
+/*****************************************************************
+Pretty prints a dfield value according to its data type. */
+
+void
+dfield_print(
+/*=========*/
+ dfield_t* dfield);/* in: dfield */
+/*****************************************************************
+Pretty prints a dfield value according to its data type. Also the hex string
+is printed if a string contains non-printable characters. */
+
+void
+dfield_print_also_hex(
+/*==================*/
+ dfield_t* dfield); /* in: dfield */
+/**************************************************************
+The following function prints the contents of a tuple. */
+
+void
+dtuple_print(
+/*=========*/
+ dtuple_t* tuple); /* in: tuple */
+/**************************************************************
+The following function prints the contents of a tuple to a buffer. */
+
+ulint
+dtuple_sprintf(
+/*===========*/
+ /* out: printed length in bytes */
+ char* buf, /* in: print buffer */
+ ulint buf_len,/* in: buf length in bytes */
+ dtuple_t* tuple); /* in: tuple */
+/***************************************************************
+Generates a random tuple. */
+
+dtuple_t*
+dtuple_gen_rnd_tuple(
+/*=================*/
+ /* out: pointer to the tuple */
+ mem_heap_t* heap); /* in: memory heap where generated */
+/*******************************************************************
+Generates a test tuple for sort and comparison tests. */
+
+void
+dtuple_gen_test_tuple(
+/*==================*/
+ dtuple_t* tuple, /* in/out: a tuple with 3 fields */
+ ulint i); /* in: a number, 0 <= i < 512 */
+/*******************************************************************
+Generates a test tuple for B-tree speed tests. */
+
+void
+dtuple_gen_test_tuple3(
+/*===================*/
+ dtuple_t* tuple, /* in/out: a tuple with 3 fields */
+ ulint i, /* in: a number < 1000000 */
+ ulint type, /* in: DTUPLE_TEST_FIXED30, ... */
+ byte* buf); /* in: a buffer of size >= 8 bytes */
+/*******************************************************************
+Generates a test tuple for B-tree speed tests. */
+
+void
+dtuple_gen_search_tuple3(
+/*=====================*/
+ dtuple_t* tuple, /* in/out: a tuple with 1 or 2 fields */
+ ulint i, /* in: a number < 1000000 */
+ byte* buf); /* in: a buffer of size >= 8 bytes */
+/*******************************************************************
+Generates a test tuple for TPC-A speed test. */
+
+void
+dtuple_gen_test_tuple_TPC_A(
+/*========================*/
+ dtuple_t* tuple, /* in/out: a tuple with >= 3 fields */
+ ulint i, /* in: a number < 10000 */
+ byte* buf); /* in: a buffer of size >= 16 bytes */
+/*******************************************************************
+Generates a test tuple for B-tree speed tests. */
+
+void
+dtuple_gen_search_tuple_TPC_A(
+/*==========================*/
+ dtuple_t* tuple, /* in/out: a tuple with 1 field */
+ ulint i, /* in: a number < 10000 */
+ byte* buf); /* in: a buffer of size >= 16 bytes */
+/*******************************************************************
+Generates a test tuple for TPC-C speed test. */
+
+void
+dtuple_gen_test_tuple_TPC_C(
+/*========================*/
+ dtuple_t* tuple, /* in/out: a tuple with >= 12 fields */
+ ulint i, /* in: a number < 100000 */
+ byte* buf); /* in: a buffer of size >= 16 bytes */
+/*******************************************************************
+Generates a test tuple for B-tree speed tests. */
+
+void
+dtuple_gen_search_tuple_TPC_C(
+/*==========================*/
+ dtuple_t* tuple, /* in/out: a tuple with 1 field */
+ ulint i, /* in: a number < 100000 */
+ byte* buf); /* in: a buffer of size >= 16 bytes */
+
+/* Types of the third field in dtuple_gen_test_tuple3 */
+#define DTUPLE_TEST_FIXED30 1
+#define DTUPLE_TEST_RND30 2
+#define DTUPLE_TEST_RND3500 3
+#define DTUPLE_TEST_FIXED2000 4
+#define DTUPLE_TEST_FIXED3 5
+
+/*######################################################################*/
+
+/* Structure for an SQL data field */
+struct dfield_struct{
+ void* data; /* pointer to data */
+ ulint len; /* data length; UNIV_SQL_NULL if SQL null */
+ dtype_t type; /* type of data */
+ ulint col_no; /* when building index entries, the column
+ number can be stored here */
+};
+
+struct dtuple_struct {
+ ulint info_bits; /* info bits of an index record:
+ default is 0; this field is used
+ if an index record is built from
+ a data tuple */
+ ulint n_fields; /* number of fields in dtuple */
+ ulint n_fields_cmp; /* number of fields which should
+ be used in comparison services
+ of rem0cmp.*; the index search
+ is performed by comparing only these
+ fields, others are ignored; the
+ default value in dtuple creation is
+ the same value as n_fields */
+ dfield_t* fields; /* fields */
+ UT_LIST_NODE_T(dtuple_t) tuple_list;
+ /* data tuples can be linked into a
+ list using this field */
+ ulint magic_n;
+};
+#define DATA_TUPLE_MAGIC_N 65478679
+
+#ifndef UNIV_NONINL
+#include "data0data.ic"
+#endif
+
+#endif
diff --git a/innobase/include/data0data.ic b/innobase/include/data0data.ic
new file mode 100644
index 00000000000..27b5552d338
--- /dev/null
+++ b/innobase/include/data0data.ic
@@ -0,0 +1,491 @@
+/************************************************************************
+SQL data field and tuple
+
+(c) 1994-1996 Innobase Oy
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0mem.h"
+#include "ut0rnd.h"
+
+extern byte data_error;
+
+/*************************************************************************
+Gets pointer to the type struct of SQL data field. */
+UNIV_INLINE
+dtype_t*
+dfield_get_type(
+/*============*/
+ /* out: pointer to the type struct */
+ dfield_t* field) /* in: SQL data field */
+{
+ ut_ad(field);
+
+ return(&(field->type));
+}
+
+/*************************************************************************
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /* in: SQL data field */
+ dtype_t* type) /* in: pointer to data type struct */
+{
+ ut_ad(field && type);
+
+ field->type = *type;
+}
+
+/*************************************************************************
+Gets pointer to the data in a field. */
+UNIV_INLINE
+void*
+dfield_get_data(
+/*============*/
+ /* out: pointer to data */
+ dfield_t* field) /* in: field */
+{
+ ut_ad(field);
+ ut_ad((field->len == UNIV_SQL_NULL)
+ || (field->data != &data_error));
+
+ return(field->data);
+}
+
+/*************************************************************************
+Gets length of field data. */
+UNIV_INLINE
+ulint
+dfield_get_len(
+/*===========*/
+ /* out: length of data; UNIV_SQL_NULL if
+ SQL null data */
+ dfield_t* field) /* in: field */
+{
+ ut_ad(field);
+ ut_ad((field->len == UNIV_SQL_NULL)
+ || (field->data != &data_error));
+
+ return(field->len);
+}
+
+/*************************************************************************
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /* in: field */
+ ulint len) /* in: length or UNIV_SQL_NULL */
+{
+ ut_ad(field);
+
+ field->len = len;
+}
+
+/*************************************************************************
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /* in: field */
+ void* data, /* in: data */
+ ulint len) /* in: length or UNIV_SQL_NULL */
+{
+ ut_ad(field);
+
+ field->data = data;
+ field->len = len;
+}
+
+/*************************************************************************
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /* in: field to copy to */
+ dfield_t* field2) /* in: field to copy from */
+{
+ ut_ad(field1 && field2);
+
+ field1->data = field2->data;
+ field1->len = field2->len;
+}
+
+/*************************************************************************
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /* in: field to copy to */
+ dfield_t* field2) /* in: field to copy from */
+{
+ *field1 = *field2;
+}
+
+/*************************************************************************
+Tests if data length and content is equal for two dfields. */
+UNIV_INLINE
+ibool
+dfield_datas_are_equal(
+/*===================*/
+ /* out: TRUE if equal */
+ dfield_t* field1, /* in: field */
+ dfield_t* field2) /* in: field */
+{
+ ulint len;
+
+ len = field1->len;
+
+ if ((len != field2->len)
+ || ((len != UNIV_SQL_NULL)
+ && (0 != ut_memcmp(field1->data, field2->data, len)))) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Tests if dfield data length and content is equal to the given. */
+UNIV_INLINE
+ibool
+dfield_data_is_equal(
+/*=================*/
+ /* out: TRUE if equal */
+ dfield_t* field, /* in: field */
+ ulint len, /* in: data length or UNIV_SQL_NULL */
+ byte* data) /* in: data */
+{
+ if (len != field->len) {
+
+ return(FALSE);
+ }
+
+ if ((len != UNIV_SQL_NULL)
+ && (0 != ut_memcmp(field->data, data, len))) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Gets info bits in a data tuple. */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ /* out: info bits */
+ dtuple_t* tuple) /* in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->info_bits);
+}
+
+/*************************************************************************
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /* in: tuple */
+ ulint info_bits) /* in: info bits */
+{
+ ut_ad(tuple);
+
+ tuple->info_bits = info_bits;
+}
+
+/*************************************************************************
+Gets number of fields used in record comparisons. */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ /* out: number of fields used in comparisons
+ in rem0cmp.* */
+ dtuple_t* tuple) /* in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->n_fields_cmp);
+}
+
+/*************************************************************************
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /* in: tuple */
+ ulint n_fields_cmp) /* in: number of fields used in
+ comparisons in rem0cmp.* */
+{
+ ut_ad(tuple);
+ ut_ad(n_fields_cmp <= tuple->n_fields);
+
+ tuple->n_fields_cmp = n_fields_cmp;
+}
+
+/*************************************************************************
+Gets number of fields in a data tuple. */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+ /* out: number of fields */
+ dtuple_t* tuple) /* in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->n_fields);
+}
+
+/*************************************************************************
+Gets nth field of a tuple. */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+ /* out: nth field */
+ dtuple_t* tuple, /* in: tuple */
+ ulint n) /* in: index of field */
+{
+ ut_ad(tuple);
+ ut_ad(n < tuple->n_fields);
+
+ return(tuple->fields + n);
+}
+
+/**************************************************************
+Creates a data tuple in a memory heap. The default value for the number
+of fields used in record comparisons for this tuple is n_fields. */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ /* out, own: created tuple */
+ mem_heap_t* heap, /* in: memory heap where the tuple
+ is created */
+ ulint n_fields) /* in: number of fields */
+{
+ dtuple_t* tuple;
+
+ ut_ad(heap);
+
+ tuple = (dtuple_t*) mem_heap_alloc(heap, sizeof(dtuple_t)
+ + n_fields * sizeof(dfield_t));
+ tuple->info_bits = 0;
+ tuple->n_fields = n_fields;
+ tuple->n_fields_cmp = n_fields;
+ tuple->fields = (dfield_t*)(((byte*)tuple) + sizeof(dtuple_t));
+
+#ifdef UNIV_DEBUG
+ tuple->magic_n = DATA_TUPLE_MAGIC_N;
+
+ { /* In the debug version, initialize fields to an error value */
+ ulint i;
+
+ for (i = 0; i < n_fields; i++) {
+ (tuple->fields + i)->data = &data_error;
+ dfield_get_type(tuple->fields + i)->mtype = DATA_ERROR;
+ }
+ }
+#endif
+ return(tuple);
+}
+
+/**************************************************************
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted. */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ /* out: sum of data lens */
+ dtuple_t* tuple) /* in: typed data tuple */
+{
+ dfield_t* field;
+ ulint n_fields;
+ ulint len;
+ ulint i;
+ ulint sum = 0;
+
+ ut_ad(tuple);
+ ut_ad(dtuple_check_typed(tuple));
+	ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ n_fields = tuple->n_fields;
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+ len = dfield_get_len(field);
+
+ if (len == UNIV_SQL_NULL) {
+ len = dtype_get_sql_null_size(dfield_get_type(field));
+ }
+
+ sum += len;
+ }
+
+ return(sum);
+}
+
+/****************************************************************
+Returns TRUE if lengths of two dtuples are equal and respective data fields
+in them are equal. */
+UNIV_INLINE
+ibool
+dtuple_datas_are_equal(
+/*===================*/
+ /* out: TRUE if length and datas are equal */
+ dtuple_t* tuple1, /* in: tuple 1 */
+ dtuple_t* tuple2) /* in: tuple 2 */
+{
+ dfield_t* field1;
+ dfield_t* field2;
+ ulint n_fields;
+ byte* data1;
+ byte* data2;
+ ulint len1;
+ ulint len2;
+ ulint i;
+
+ ut_ad(tuple1 && tuple2);
+	ut_ad(tuple1->magic_n == DATA_TUPLE_MAGIC_N);
+	ut_ad(tuple2->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple1));
+ ut_ad(dtuple_check_typed(tuple2));
+
+ n_fields = dtuple_get_n_fields(tuple1);
+
+ if (n_fields != dtuple_get_n_fields(tuple2)) {
+
+ return(FALSE);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+
+ field1 = dtuple_get_nth_field(tuple1, i);
+ data1 = (byte*) dfield_get_data(field1);
+ len1 = dfield_get_len(field1);
+
+ field2 = dtuple_get_nth_field(tuple2, i);
+ data2 = (byte*) dfield_get_data(field2);
+ len2 = dfield_get_len(field2);
+
+ if (len1 != len2) {
+
+ return(FALSE);
+ }
+
+ if (len1 != UNIV_SQL_NULL) {
+ if (ut_memcmp(data1, data2, len1) != 0) {
+
+ return(FALSE);
+ }
+ }
+ }
+
+ return(TRUE);
+}
+
+/***********************************************************************
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /* in: data tuple */
+ ulint n) /* in: number of fields to set */
+{
+ dtype_t* dfield_type;
+ ulint i;
+
+ for (i = 0; i < n; i++) {
+ dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i));
+ dtype_set(dfield_type, DATA_BINARY, 0, 0, 0);
+ }
+}
+
+/****************************************************************
+Folds a prefix given as the number of fields of a tuple. */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+ /* out: the folded value */
+ dtuple_t* tuple, /* in: the tuple */
+ ulint n_fields,/* in: number of complete fields to fold */
+ ulint n_bytes,/* in: number of bytes to fold in an
+ incomplete last field */
+ dulint tree_id)/* in: index tree id */
+{
+ dfield_t* field;
+ ulint i;
+ byte* data;
+ ulint len;
+ ulint fold;
+
+ ut_ad(tuple);
+	ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple));
+
+ fold = ut_fold_dulint(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
+
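+/* Illustrative sketch only, not part of the original file: computing the
+fold value of a one-field prefix of a typed tuple, as is done for hash
+index lookups. ut_dulint_create() is assumed to come from ut0byte.h; the
+tree id used here is an arbitrary example value. */
+#if 0
+static
+ulint
+dtuple_fold_example(
+/*================*/
+				/* out: fold value of the first field */
+	dtuple_t*	tuple)	/* in: tuple with at least one typed field */
+{
+	/* Fold one complete field and no bytes of an incomplete field */
+
+	return(dtuple_fold(tuple, 1, 0, ut_dulint_create(0, 5)));
+}
+#endif
+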
+/**************************************************************************
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /* in: pointer to a buffer of size len */
+ ulint len) /* in: SQL null size in bytes */
+{
+ ulint j;
+
+ for (j = 0; j < len; j++) {
+ data[j] = '\0';
+ }
+}
diff --git a/innobase/include/data0type.h b/innobase/include/data0type.h
new file mode 100644
index 00000000000..4817f0ca839
--- /dev/null
+++ b/innobase/include/data0type.h
@@ -0,0 +1,214 @@
+/******************************************************
+Data types
+
+(c) 1996 Innobase Oy
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef data0type_h
+#define data0type_h
+
+#include "univ.i"
+
+/* SQL data type struct */
+typedef struct dtype_struct dtype_t;
+
+/* This variable is initialized as the standard binary variable length
+data type */
+extern dtype_t* dtype_binary;
+
+/* Data main types of SQL data; NOTE! character data types requiring
+collation transformation must have the smallest codes! All codes must be
+less than 256! */
+#define DATA_VARCHAR 1 /* character varying */
+#define DATA_CHAR 2 /* fixed length character */
+#define DATA_FIXBINARY 3 /* binary string of fixed length */
+#define DATA_BINARY 4 /* binary string */
+#define DATA_BLOB 5 /* binary large object */
+#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */
+#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */
+#define DATA_SYS 8 /* system column */
+/* Data types >= DATA_FLOAT must be compared using the whole field, not as
+binary strings */
+#define DATA_FLOAT 9
+#define DATA_DOUBLE 10
+#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */
+#define DATA_VARMYSQL 12 /* data types for which comparisons must be */
+#define DATA_MYSQL 13 /* made by MySQL */
+#define DATA_ERROR 111 /* error value */
+#define DATA_MTYPE_MAX 255
+/*-------------------------------------------*/
+/* Precise data types for system columns; NOTE: the values must run
+from 0 up in the order given! All codes must be less than 256! */
+#define DATA_ROW_ID 0 /* row id: a dulint */
+#define DATA_ROW_ID_LEN 6 /* stored length for row id */
+#define DATA_TRX_ID 1 /* transaction id: 6 bytes */
+#define DATA_TRX_ID_LEN 6
+#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */
+#define DATA_ROLL_PTR_LEN 7
+#define DATA_MIX_ID 3 /* mixed index label: a dulint, stored in
+ a row in a compressed form */
+#define DATA_MIX_ID_LEN 9 /* maximum stored length for mix id (in a
+ compressed dulint form) */
+#define DATA_N_SYS_COLS 4 /* number of system columns defined above */
+#define DATA_NOT_NULL 256 /* this is ORed to the precise type when
+ the column is declared as NOT NULL */
+#define DATA_UNSIGNED	512	/* this is ORed to the precise type when
+ we have an unsigned integer type */
+/*-------------------------------------------*/
+
+/* Precise types of a char or varchar data. All codes must be less than 256! */
+#define DATA_ENGLISH 4 /* English language character string */
+#define DATA_FINNISH 5 /* Finnish */
+#define DATA_PRTYPE_MAX 255
+
+/* The number of bytes we need to store the type information affecting the
+alphabetical ordering of a single field, and to decide the storage size of
+an SQL null. */
+#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4
+
+/*************************************************************************
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /* in: type struct to init */
+ ulint mtype, /* in: main data type */
+ ulint prtype, /* in: precise type */
+ ulint len, /* in: length of type */
+ ulint prec); /* in: precision of type */
+/*************************************************************************
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /* in: type struct to copy to */
+ dtype_t* type2); /* in: type struct to copy from */
+/*************************************************************************
+Gets the SQL main data type. */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ dtype_t* type);
+/*************************************************************************
+Gets the precise data type. */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ dtype_t* type);
+/*************************************************************************
+Gets the type length. */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ dtype_t* type);
+/*************************************************************************
+Gets the type precision. */
+UNIV_INLINE
+ulint
+dtype_get_prec(
+/*===========*/
+ dtype_t* type);
+/*************************************************************************
+Gets the padding character code for the type. */
+UNIV_INLINE
+ulint
+dtype_get_pad_char(
+/*===============*/
+ /* out: padding character code, or
+ ULINT_UNDEFINED if no padding specified */
+	dtype_t*	type);	/* in: type */
+/*************************************************************************
+Transforms the character code so that it is ordered appropriately
+for the language. */
+UNIV_INLINE
+ulint
+dtype_collate(
+/*==========*/
+				/* out: collation order position */
+ dtype_t* type, /* in: type */
+ ulint code); /* in: character code stored in database
+ record */
+/***************************************************************************
+Returns the size of a fixed size data type, 0 if not a fixed size type. */
+UNIV_INLINE
+ulint
+dtype_get_fixed_size(
+/*=================*/
+ /* out: fixed size, or 0 */
+ dtype_t* type); /* in: type */
+/***************************************************************************
+Returns a stored SQL NULL size for a type. For fixed length types it is
+the fixed length of the type, otherwise 0. */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ /* out: SQL null storage size */
+ dtype_t* type); /* in: type */
+/***************************************************************************
+Returns TRUE if a type is of a fixed size. */
+UNIV_INLINE
+ibool
+dtype_is_fixed_size(
+/*================*/
+ /* out: TRUE if fixed size */
+ dtype_t* type); /* in: type */
+/**************************************************************************
+Stores to a type the information which determines its alphabetical
+ordering. */
+UNIV_INLINE
+void
+dtype_store_for_order_and_null_size(
+/*================================*/
+ byte* buf, /* in: buffer for DATA_ORDER_NULL_TYPE_BUF_SIZE
+ bytes */
+ dtype_t* type); /* in: type struct */
+/**************************************************************************
+Reads from a buffer the stored information which determines the alphabetical
+ordering of a type. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+	dtype_t*	type,	/* out: type struct */
+ byte* buf); /* in: buffer for type order info */
+/*************************************************************************
+Validates a data type structure. */
+
+ibool
+dtype_validate(
+/*===========*/
+ /* out: TRUE if ok */
+ dtype_t* type); /* in: type struct to validate */
+/*************************************************************************
+Prints a data type structure. */
+
+void
+dtype_print(
+/*========*/
+ dtype_t* type); /* in: type */
+
+/* Structure for an SQL data type */
+
+struct dtype_struct{
+ ulint mtype; /* main data type */
+ ulint prtype; /* precise type; MySQL data type */
+
+ /* remaining two fields do not affect alphabetical ordering: */
+
+ ulint len; /* length */
+ ulint prec; /* precision */
+};
+
+#ifndef UNIV_NONINL
+#include "data0type.ic"
+#endif
+
+#endif
diff --git a/innobase/include/data0type.ic b/innobase/include/data0type.ic
new file mode 100644
index 00000000000..ca93c6a5383
--- /dev/null
+++ b/innobase/include/data0type.ic
@@ -0,0 +1,248 @@
+/******************************************************
+Data types
+
+(c) 1996 Innobase Oy
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+
+/*************************************************************************
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /* in: type struct to init */
+ ulint mtype, /* in: main data type */
+ ulint prtype, /* in: precise type */
+ ulint len, /* in: length of type */
+ ulint prec) /* in: precision of type */
+{
+ ut_ad(type);
+ ut_ad(mtype <= DATA_MTYPE_MAX);
+
+ type->mtype = mtype;
+ type->prtype = prtype;
+ type->len = len;
+ type->prec = prec;
+
+ ut_ad(dtype_validate(type));
+}
+
+/*************************************************************************
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /* in: type struct to copy to */
+ dtype_t* type2) /* in: type struct to copy from */
+{
+ *type1 = *type2;
+
+ ut_ad(dtype_validate(type1));
+}
+
+/*************************************************************************
+Gets the SQL main data type. */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ dtype_t* type)
+{
+ ut_ad(type);
+
+ return(type->mtype);
+}
+
+/*************************************************************************
+Gets the precise data type. */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ dtype_t* type)
+{
+ ut_ad(type);
+
+ return(type->prtype);
+}
+
+/*************************************************************************
+Gets the type length. */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ dtype_t* type)
+{
+ ut_ad(type);
+
+ return(type->len);
+}
+
+/*************************************************************************
+Gets the type precision. */
+UNIV_INLINE
+ulint
+dtype_get_prec(
+/*===========*/
+ dtype_t* type)
+{
+ ut_ad(type);
+
+ return(type->prec);
+}
+
+/*************************************************************************
+Gets the padding character code for the type. */
+UNIV_INLINE
+ulint
+dtype_get_pad_char(
+/*===============*/
+ /* out: padding character code, or
+ ULINT_UNDEFINED if no padding specified */
+ dtype_t* type) /* in: type */
+{
+ if (type->mtype == DATA_CHAR) {
+ /* space is the padding character for all char strings */
+
+ return((ulint)' ');
+ }
+
+ ut_ad((type->mtype == DATA_BINARY) || (type->mtype == DATA_VARCHAR));
+
+ /* No padding specified */
+
+ return(ULINT_UNDEFINED);
+}
+
+/*************************************************************************
+Transforms the character code so that it is ordered appropriately for the
+language. */
+UNIV_INLINE
+ulint
+dtype_collate(
+/*==========*/
+ /* out: collation order position */
+ dtype_t* type, /* in: type */
+ ulint code) /* in: character code stored in database
+ record */
+{
+ ut_ad((type->mtype == DATA_CHAR) || (type->mtype == DATA_VARCHAR));
+
+ return(toupper(code));
+}
+
+/**************************************************************************
+Stores to a type the information which determines its alphabetical
+ordering. */
+UNIV_INLINE
+void
+dtype_store_for_order_and_null_size(
+/*================================*/
+ byte* buf, /* in: buffer for DATA_ORDER_NULL_TYPE_BUF_SIZE
+ bytes */
+ dtype_t* type) /* in: type struct */
+{
+ ut_ad(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ ut_ad(type->prtype < 256);
+
+ buf[0] = (byte)(type->mtype & 0xFF);
+ buf[1] = (byte)(type->prtype & 0xFF);
+
+ mach_write_to_2(buf + 2, type->len & 0xFFFF);
+}
+
+/**************************************************************************
+Reads from a buffer the stored information which determines the alphabetical
+ordering of a type. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+	dtype_t*	type,	/* out: type struct */
+ byte* buf) /* in: buffer for type order info */
+{
+ ut_ad(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE);
+
+ type->mtype = buf[0];
+ type->prtype = buf[1];
+
+ type->len = mach_read_from_2(buf + 2);
+}
+
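+/* Illustrative sketch only, not part of the original file: the 4-byte
+buffer written by dtype_store_for_order_and_null_size() can be read back
+with dtype_read_for_order_and_null_size(); the precision is not stored,
+as it does not affect the alphabetical ordering. The concrete type used
+below is an arbitrary example. */
+#if 0
+static
+void
+dtype_order_info_example(void)
+/*==========================*/
+{
+	byte	buf[DATA_ORDER_NULL_TYPE_BUF_SIZE];
+	dtype_t	type1;
+	dtype_t	type2;
+
+	dtype_set(&type1, DATA_VARCHAR, DATA_ENGLISH, 20, 0);
+
+	dtype_store_for_order_and_null_size(buf, &type1);
+	dtype_read_for_order_and_null_size(&type2, buf);
+
+	ut_a(dtype_get_mtype(&type2) == DATA_VARCHAR);
+	ut_a(dtype_get_prtype(&type2) == DATA_ENGLISH);
+	ut_a(dtype_get_len(&type2) == 20);
+}
+#endif
+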
+/***************************************************************************
+Returns the size of a fixed size data type, 0 if not a fixed size type. */
+UNIV_INLINE
+ulint
+dtype_get_fixed_size(
+/*=================*/
+ /* out: fixed size, or 0 */
+ dtype_t* type) /* in: type */
+{
+ ulint mtype;
+
+ mtype = dtype_get_mtype(type);
+
+ switch (mtype) {
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_MYSQL:
+ return(dtype_get_len(type));
+
+	case DATA_SYS:
+		if (type->prtype == DATA_ROW_ID) {
+			return(DATA_ROW_ID_LEN);
+		} else {
+			return(0);
+		}
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_BLOB:
+ return(0);
+ default: ut_a(0);
+ }
+
+ return(0);
+}
+
+/***************************************************************************
+Returns a stored SQL NULL size for a type. For fixed length types it is
+the fixed length of the type, otherwise 0. */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ /* out: SQL null storage size */
+ dtype_t* type) /* in: type */
+{
+ return(dtype_get_fixed_size(type));
+}
+
+/***************************************************************************
+Returns TRUE if a type is of a fixed size. */
+UNIV_INLINE
+ibool
+dtype_is_fixed_size(
+/*================*/
+ /* out: TRUE if fixed size */
+ dtype_t* type) /* in: type */
+{
+ ulint size;
+
+ size = dtype_get_fixed_size(type);
+
+ if (size) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
diff --git a/innobase/include/data0types.h b/innobase/include/data0types.h
new file mode 100644
index 00000000000..ab314f8f471
--- /dev/null
+++ b/innobase/include/data0types.h
@@ -0,0 +1,19 @@
+/************************************************************************
+Some type definitions
+
+(c) 1994-2000 Innobase Oy
+
+Created 9/21/2000 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0types_h
+#define data0types_h
+
+/* SQL data field struct */
+typedef struct dfield_struct dfield_t;
+
+/* SQL data tuple struct */
+typedef struct dtuple_struct dtuple_t;
+
+#endif
+
diff --git a/innobase/include/db0err.h b/innobase/include/db0err.h
new file mode 100644
index 00000000000..34513545faa
--- /dev/null
+++ b/innobase/include/db0err.h
@@ -0,0 +1,44 @@
+/******************************************************
+Global error codes for the database
+
+(c) 1996 Innobase Oy
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+
+#define DB_SUCCESS 10
+
+/* The following are error codes */
+#define DB_ERROR 11
+#define DB_OUT_OF_MEMORY 12
+#define DB_OUT_OF_FILE_SPACE 13
+#define DB_LOCK_WAIT 14
+#define DB_DEADLOCK 15
+#define DB_ROLLBACK 16
+#define DB_DUPLICATE_KEY 17
+#define DB_QUE_THR_SUSPENDED 18
+#define DB_MISSING_HISTORY 19 /* required history data has been
+ deleted due to lack of space in
+ rollback segment */
+#define DB_CLUSTER_NOT_FOUND 30
+#define DB_TABLE_NOT_FOUND 31
+#define DB_MUST_GET_MORE_FILE_SPACE 32 /* the database has to be stopped
+					and restarted with more file space */
+#define DB_TABLE_IS_BEING_USED 33
+#define DB_TOO_BIG_RECORD 34 /* a record in an index would become
+ bigger than 1/2 free space in a page
+ frame */
+
+/* The following are partial failure codes */
+#define DB_FAIL 1000
+#define DB_OVERFLOW 1001
+#define DB_UNDERFLOW 1002
+#define DB_STRONG_FAIL 1003
+#define DB_RECORD_NOT_FOUND 1500
+#define DB_END_OF_INDEX 1501
+
+#endif
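+/* Illustrative sketch only, not part of the original interface: a
+hypothetical helper showing the intended use of the code ranges above.
+DB_SUCCESS means success, codes below DB_FAIL are genuine errors, and
+codes from DB_FAIL upwards report partial failures or not-found
+conditions that callers typically handle locally. ibool, TRUE and FALSE
+are assumed to come from univ.i. */
+#if 0
+static
+ibool
+db_err_is_hard_error(
+/*=================*/
+			/* out: TRUE if err is a genuine error code */
+	ulint	err)	/* in: error code from the list above */
+{
+	if (err == DB_SUCCESS || err >= DB_FAIL) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+#endif
+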
diff --git a/innobase/include/dict0boot.h b/innobase/include/dict0boot.h
new file mode 100644
index 00000000000..71180439913
--- /dev/null
+++ b/innobase/include/dict0boot.h
@@ -0,0 +1,132 @@
+/******************************************************
+Data dictionary creation and booting
+
+(c) 1996 Innobase Oy
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0boot_h
+#define dict0boot_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "buf0buf.h"
+#include "fsp0fsp.h"
+#include "dict0dict.h"
+
+typedef byte dict_hdr_t;
+
+/**************************************************************************
+Gets a pointer to the dictionary header and x-latches its page. */
+UNIV_INLINE
+dict_hdr_t*
+dict_hdr_get(
+/*=========*/
+ /* out: pointer to the dictionary header,
+ page x-latched */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+Returns a new table, index, or tree id. */
+UNIV_INLINE
+dulint
+dict_hdr_get_new_id(
+/*================*/
+ /* out: the new id */
+	ulint	type);	/* in: DICT_HDR_TABLE_ID, ... */
+/**************************************************************************
+Returns a new row id. */
+UNIV_INLINE
+dulint
+dict_sys_get_new_row_id(void);
+/*=========================*/
+ /* out: the new id */
+/**************************************************************************
+Reads a row id from a record or other 6-byte stored form. */
+UNIV_INLINE
+dulint
+dict_sys_read_row_id(
+/*=================*/
+ /* out: row id */
+ byte* field); /* in: record field */
+/**************************************************************************
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+ byte* field, /* in: record field */
+ dulint row_id);/* in: row id */
+/*********************************************************************
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created. */
+
+void
+dict_boot(void);
+/*===========*/
+/*********************************************************************
+Creates and initializes the data dictionary at the database creation. */
+
+void
+dict_create(void);
+/*=============*/
+
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ids for the basic system tables and their indexes */
+#define DICT_TABLES_ID ut_dulint_create(0, 1)
+#define DICT_COLUMNS_ID ut_dulint_create(0, 2)
+#define DICT_INDEXES_ID ut_dulint_create(0, 3)
+#define DICT_FIELDS_ID ut_dulint_create(0, 4)
+/* The following is a secondary index on SYS_TABLES */
+#define DICT_TABLE_IDS_ID ut_dulint_create(0, 5)
+
+#define DICT_HDR_FIRST_ID 10 /* the ids for tables etc. start
+ from this number, except for basic
+ system tables and their above defined
+ indexes; ibuf tables and indexes are
+					assigned the id
+					DICT_IBUF_ID_MIN plus the space id */
+#define DICT_IBUF_ID_MIN ut_dulint_create(0xFFFFFFFF, 0)
+
+/* The offset of the dictionary header on the page */
+#define DICT_HDR FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */
+#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */
+#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */
+#define DICT_HDR_MIX_ID 24 /* The latest assigned mix id */
+#define DICT_HDR_TABLES 32 /* Root of the table index tree */
+#define DICT_HDR_TABLE_IDS	36	/* Root of the table id index tree */
+#define DICT_HDR_COLUMNS 40 /* Root of the column index tree */
+#define DICT_HDR_INDEXES 44 /* Root of the index index tree */
+#define DICT_HDR_FIELDS 48 /* Root of the index field index tree */
+
+#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace
+ segment into which the dictionary
+ header is created */
+/*-------------------------------------------------------------*/
+
+/* The field numbers of the page number and space id fields in the
+SYS_INDEXES table clustered index */
+#define DICT_SYS_INDEXES_PAGE_NO_FIELD 8
+#define DICT_SYS_INDEXES_SPACE_NO_FIELD 7
+
+/* When a row id which is zero modulo this number (which must be a power of
+two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
+updated */
+#define DICT_HDR_ROW_ID_WRITE_MARGIN 256
+
+#ifndef UNIV_NONINL
+#include "dict0boot.ic"
+#endif
+
+#endif
diff --git a/innobase/include/dict0boot.ic b/innobase/include/dict0boot.ic
new file mode 100644
index 00000000000..8f1e214701f
--- /dev/null
+++ b/innobase/include/dict0boot.ic
@@ -0,0 +1,124 @@
+/******************************************************
+Data dictionary creation and booting
+
+(c) 1996 Innobase Oy
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+/**************************************************************************
+Writes the current value of the row id counter to the dictionary header file
+page. */
+
+void
+dict_hdr_flush_row_id(void);
+/*=======================*/
+
+
+/**************************************************************************
+Gets a pointer to the dictionary header and x-latches its page. */
+UNIV_INLINE
+dict_hdr_t*
+dict_hdr_get(
+/*=========*/
+ /* out: pointer to the dictionary header,
+ page x-latched */
+ mtr_t* mtr) /* in: mtr */
+{
+ dict_hdr_t* header;
+
+ ut_ad(mtr);
+
+ header = DICT_HDR + buf_page_get(DICT_HDR_SPACE, DICT_HDR_PAGE_NO,
+ RW_X_LATCH, mtr);
+ buf_page_dbg_add_level(header, SYNC_DICT_HEADER);
+
+ return(header);
+}
+
+/**************************************************************************
+Returns a new table, index, or tree id. */
+UNIV_INLINE
+dulint
+dict_hdr_get_new_id(
+/*================*/
+ /* out: the new id */
+	ulint	type)	/* in: DICT_HDR_TABLE_ID, ... */
+{
+ dict_hdr_t* dict_hdr;
+ dulint id;
+ mtr_t mtr;
+
+ ut_ad((type == DICT_HDR_TABLE_ID) || (type == DICT_HDR_INDEX_ID)
+ || (type == DICT_HDR_MIX_ID));
+
+ mtr_start(&mtr);
+
+ dict_hdr = dict_hdr_get(&mtr);
+
+ id = mtr_read_dulint(dict_hdr + type, MLOG_8BYTES, &mtr);
+
+ id = ut_dulint_add(id, 1);
+
+ mlog_write_dulint(dict_hdr + type, id, MLOG_8BYTES, &mtr);
+
+ mtr_commit(&mtr);
+
+ return(id);
+}
+
+/**************************************************************************
+Returns a new row id. */
+UNIV_INLINE
+dulint
+dict_sys_get_new_row_id(void)
+/*=========================*/
+ /* out: the new id */
+{
+ dulint id;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ id = dict_sys->row_id;
+
+ if (0 == (ut_dulint_get_low(id) % DICT_HDR_ROW_ID_WRITE_MARGIN)) {
+
+ dict_hdr_flush_row_id();
+ }
+
+ UT_DULINT_INC(dict_sys->row_id);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(id);
+}
+
+/**************************************************************************
+Reads a row id from a record or other 6-byte stored form. */
+UNIV_INLINE
+dulint
+dict_sys_read_row_id(
+/*=================*/
+ /* out: row id */
+ byte* field) /* in: record field */
+{
+ ut_ad(DATA_ROW_ID_LEN == 6);
+
+ return(mach_read_from_6(field));
+}
+
+/**************************************************************************
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+ byte* field, /* in: record field */
+ dulint row_id) /* in: row id */
+{
+ ut_ad(DATA_ROW_ID_LEN == 6);
+
+ mach_write_to_6(field, row_id);
+}
+
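+/* Illustrative sketch only, not part of the original file: a row id
+occupies DATA_ROW_ID_LEN (6) bytes in its stored form, and
+dict_sys_write_row_id() / dict_sys_read_row_id() round-trip it through
+such a buffer. dict_boot() is assumed to have been called, and
+ut_dulint_cmp() is assumed to come from ut0byte.h. */
+#if 0
+static
+void
+dict_sys_row_id_example(void)
+/*=========================*/
+{
+	byte	buf[DATA_ROW_ID_LEN];
+	dulint	id;
+
+	id = dict_sys_get_new_row_id();
+
+	dict_sys_write_row_id(buf, id);
+
+	ut_a(0 == ut_dulint_cmp(id, dict_sys_read_row_id(buf)));
+}
+#endif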
+
diff --git a/innobase/include/dict0crea.h b/innobase/include/dict0crea.h
new file mode 100644
index 00000000000..6bc31e1e722
--- /dev/null
+++ b/innobase/include/dict0crea.h
@@ -0,0 +1,140 @@
+/******************************************************
+Database object creation
+
+(c) 1996 Innobase Oy
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0crea_h
+#define dict0crea_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "dict0dict.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/*************************************************************************
+Creates the default clustered index for a table: the records are ordered
+by row id. */
+
+void
+dict_create_default_index(
+/*======================*/
+ dict_table_t* table, /* in: table */
+ trx_t* trx); /* in: transaction handle */
+/*************************************************************************
+Creates a table create graph. */
+
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+ /* out, own: table create node */
+ dict_table_t* table, /* in: table to create, built as a memory data
+ structure */
+ mem_heap_t* heap); /* in: heap where created */
+/*************************************************************************
+Creates an index create graph. */
+
+ind_node_t*
+ind_create_graph_create(
+/*====================*/
+ /* out, own: index create node */
+ dict_index_t* index, /* in: index to create, built as a memory data
+ structure */
+ mem_heap_t* heap); /* in: heap where created */
+/***************************************************************
+Creates a table. This is a high-level function used in SQL execution graphs. */
+
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/***************************************************************
+Creates an index. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/***********************************************************************
+Drops the index tree associated with a row in SYS_INDEXES table. */
+
+void
+dict_drop_index_tree(
+/*=================*/
+ rec_t* rec, /* in: record in the clustered index of SYS_INDEXES
+ table */
+ mtr_t* mtr); /* in: mtr having the latch on the record page */
+
+
+/* Table create node structure */
+
+struct tab_node_struct{
+ que_common_t common; /* node type: QUE_NODE_TABLE_CREATE */
+ dict_table_t* table; /* table to create, built as a memory data
+ structure with dict_mem_... functions */
+ ins_node_t* tab_def; /* child node which does the insert of
+ the table definition; the row to be inserted
+ is built by the parent node */
+ ins_node_t* col_def; /* child node which does the inserts of
+ the column definitions; the row to be inserted
+ is built by the parent node */
+ commit_node_t* commit_node;
+ /* child node which performs a commit after
+ a successful table creation */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /* node execution state */
+ ulint col_no; /* next column definition to insert */
+ mem_heap_t* heap; /* memory heap used as auxiliary storage */
+};
+
+/* Table create node states */
+#define TABLE_BUILD_TABLE_DEF 1
+#define TABLE_BUILD_COL_DEF 2
+#define TABLE_COMMIT_WORK 3
+#define TABLE_ADD_TO_CACHE 4
+#define TABLE_COMPLETED 5
+
+/* Index create node struct */
+
+struct ind_node_struct{
+ que_common_t common; /* node type: QUE_NODE_INDEX_CREATE */
+ dict_index_t* index; /* index to create, built as a memory data
+ structure with dict_mem_... functions */
+ ins_node_t* ind_def; /* child node which does the insert of
+ the index definition; the row to be inserted
+ is built by the parent node */
+ ins_node_t* field_def; /* child node which does the inserts of
+ the field definitions; the row to be inserted
+ is built by the parent node */
+ commit_node_t* commit_node;
+ /* child node which performs a commit after
+ a successful index creation */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /* node execution state */
+ dict_table_t* table; /* table which owns the index */
+ dtuple_t* ind_row;/* index definition row built */
+ ulint field_no;/* next field definition to insert */
+ mem_heap_t* heap; /* memory heap used as auxiliary storage */
+};
+
+/* Index create node states */
+#define INDEX_BUILD_INDEX_DEF 1
+#define INDEX_BUILD_FIELD_DEF 2
+#define INDEX_CREATE_INDEX_TREE 3
+#define INDEX_COMMIT_WORK 4
+#define INDEX_ADD_TO_CACHE 5
+
+#ifndef UNIV_NONINL
+#include "dict0crea.ic"
+#endif
+
+#endif
diff --git a/innobase/include/dict0crea.ic b/innobase/include/dict0crea.ic
new file mode 100644
index 00000000000..b4da2d7e03f
--- /dev/null
+++ b/innobase/include/dict0crea.ic
@@ -0,0 +1,8 @@
+/******************************************************
+Database object creation
+
+(c) 1996 Innobase Oy
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h
new file mode 100644
index 00000000000..b4ff9e90c75
--- /dev/null
+++ b/innobase/include/dict0dict.h
@@ -0,0 +1,677 @@
+/******************************************************
+Data dictionary system
+
+(c) 1996 Innobase Oy
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0dict_h
+#define dict0dict_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "dict0mem.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "sync0sync.h"
+#include "sync0rw.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "btr0types.h"
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "hash0hash.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "trx0types.h"
+
+/**************************************************************************
+Inits the data dictionary module. */
+
+void
+dict_init(void);
+/*===========*/
+/**************************************************************************
+Returns a stored procedure object and memoryfixes it. */
+UNIV_INLINE
+dict_proc_t*
+dict_procedure_get(
+/*===============*/
+ /* out: procedure, NULL if does not exist */
+	char*	proc_name,	/* in: procedure name */
+ trx_t* trx); /* in: transaction handle or NULL */
+/**************************************************************************
+Adds a stored procedure object to the dictionary cache. */
+
+void
+dict_procedure_add_to_cache(
+/*========================*/
+ dict_proc_t* proc); /* in: procedure */
+/**************************************************************************
+Reserves a parsed copy of a stored procedure to execute. If there are no
+free parsed copies left at the moment, parses a new copy. Takes the copy off
+the list of copies: the copy must be returned there with
+dict_procedure_release_parsed_copy. */
+
+que_t*
+dict_procedure_reserve_parsed_copy(
+/*===============================*/
+ /* out: the query graph */
+ dict_proc_t* proc); /* in: dictionary procedure node */
+/**************************************************************************
+Releases a parsed copy of an executed stored procedure. Puts the copy to the
+list of copies. */
+
+void
+dict_procedure_release_parsed_copy(
+/*===============================*/
+ que_t* graph); /* in: query graph of a stored procedure */
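+/* A minimal usage sketch of the reserve/release pair ('proc' is assumed to
+come from dict_procedure_get):
+
+	que_t*	graph;
+
+	graph = dict_procedure_reserve_parsed_copy(proc);
+
+	... execute the query graph ...
+
+	dict_procedure_release_parsed_copy(graph);
+*/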
+/*************************************************************************
+Gets the column data type. */
+UNIV_INLINE
+dtype_t*
+dict_col_get_type(
+/*==============*/
+	dict_col_t*	col);	/* in: column */
+/*************************************************************************
+Gets the column number. */
+UNIV_INLINE
+ulint
+dict_col_get_no(
+/*============*/
+	dict_col_t*	col);	/* in: column */
+/*************************************************************************
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+	dict_col_t*	col);	/* in: column */
+/**************************************************************************
+Adds a table object to the dictionary cache. */
+
+void
+dict_table_add_to_cache(
+/*====================*/
+ dict_table_t* table); /* in: table */
+/**************************************************************************
+Removes a table object from the dictionary cache. */
+
+void
+dict_table_remove_from_cache(
+/*=========================*/
+ dict_table_t* table); /* in, own: table */
+/**************************************************************************
+Renames a table object. */
+
+ibool
+dict_table_rename_in_cache(
+/*=======================*/
+ /* out: TRUE if success */
+ dict_table_t* table, /* in: table */
+ char* new_name); /* in: new name */
+/**************************************************************************
+Returns a table object and memoryfixes it. NOTE! This is a high-level
+function to be used mainly from outside the 'dict' directory. Inside this
+directory dict_table_get_low is usually the appropriate function. */
+
+dict_table_t*
+dict_table_get(
+/*===========*/
+ /* out: table, NULL if does not exist */
+ char* table_name, /* in: table name */
+ trx_t* trx); /* in: transaction handle */
+/**************************************************************************
+Returns a table object, based on table id, and memoryfixes it. */
+
+dict_table_t*
+dict_table_get_on_id(
+/*=================*/
+ /* out: table, NULL if does not exist */
+ dulint table_id, /* in: table id */
+ trx_t* trx); /* in: transaction handle */
+/**************************************************************************
+Returns a table object, based on table id, and memoryfixes it. */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_on_id_low(
+/*=====================*/
+ /* out: table, NULL if does not exist */
+ dulint table_id, /* in: table id */
+ trx_t* trx); /* in: transaction handle */
+/**************************************************************************
+Releases a table from being memoryfixed. Currently this has no relevance. */
+UNIV_INLINE
+void
+dict_table_release(
+/*===============*/
+ dict_table_t* table); /* in: table to be released */
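+/* A minimal usage sketch: code outside the 'dict' directory typically pairs
+dict_table_get with dict_table_release; the table name and the trx handle
+below are placeholders:
+
+	dict_table_t*	table;
+
+	table = dict_table_get("test/mytable", trx);
+
+	if (table != NULL) {
+		... use the table object ...
+
+		dict_table_release(table);
+	}
+*/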
+/**************************************************************************
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function. */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+ /* out: table, NULL if not found */
+ char* table_name); /* in: table name */
+/**************************************************************************
+Returns an index object. */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_index(
+/*=================*/
+ /* out: index, NULL if does not exist */
+ dict_table_t* table, /* in: table */
+ char* name); /* in: index name */
+/**************************************************************************
+Returns an index object. */
+
+dict_index_t*
+dict_table_get_index_noninline(
+/*===========================*/
+ /* out: index, NULL if does not exist */
+ dict_table_t* table, /* in: table */
+ char* name); /* in: index name */
+/**************************************************************************
+Prints a table definition. */
+
+void
+dict_table_print(
+/*=============*/
+ dict_table_t* table); /* in: table */
+/**************************************************************************
+Prints table data when we know the table name. */
+
+void
+dict_table_print_by_name(
+/*=====================*/
+	char*	name);	/* in: table name */
+/************************************************************************
+Gets the first index on the table (the clustered index). */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ /* out: index, NULL if none exists */
+ dict_table_t* table); /* in: table */
+/************************************************************************
+Gets the first index on the table (the clustered index). */
+
+dict_index_t*
+dict_table_get_first_index_noninline(
+/*=================================*/
+ /* out: index, NULL if none exists */
+ dict_table_t* table); /* in: table */
+/************************************************************************
+Gets the next index on the table. */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ /* out: index, NULL if none left */
+ dict_index_t* index); /* in: index */
+/************************************************************************
+Gets the next index on the table. */
+
+dict_index_t*
+dict_table_get_next_index_noninline(
+/*================================*/
+ /* out: index, NULL if none left */
+ dict_index_t* index); /* in: index */
+/************************************************************************
+Gets the number of user-defined columns in a table in the dictionary
+cache. */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+ /* out: number of user-defined (e.g., not
+ ROW_ID) columns of a table */
+ dict_table_t* table); /* in: table */
+/************************************************************************
+Gets the number of system columns in a table in the dictionary cache. */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+ /* out: number of system (e.g.,
+ ROW_ID) columns of a table */
+ dict_table_t* table); /* in: table */
+/************************************************************************
+Gets the number of all columns (also system) in a table in the dictionary
+cache. */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+ /* out: number of columns of a table */
+ dict_table_t* table); /* in: table */
+/************************************************************************
+Gets the nth column of a table. */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ /* out: pointer to column object */
+ dict_table_t* table, /* in: table */
+ ulint pos); /* in: position of column */
+/************************************************************************
+Gets the nth column of a table. */
+
+dict_col_t*
+dict_table_get_nth_col_noninline(
+/*=============================*/
+ /* out: pointer to column object */
+ dict_table_t* table, /* in: table */
+ ulint pos); /* in: position of column */
+/************************************************************************
+Gets the given system column of a table. */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ /* out: pointer to column object */
+ dict_table_t* table, /* in: table */
+ ulint sys); /* in: DATA_ROW_ID, ... */
+/************************************************************************
+Gets the given system column number of a table. */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+ /* out: column number */
+ dict_table_t* table, /* in: table */
+ ulint sys); /* in: DATA_ROW_ID, ... */
+/***********************************************************************
+Copies types of columns contained in table to tuple. */
+
+void
+dict_table_copy_types(
+/*==================*/
+ dtuple_t* tuple, /* in: data tuple */
+	dict_table_t*	table);	/* in: table */
+/**************************************************************************
+Adds an index to dictionary cache. */
+
+ibool
+dict_index_add_to_cache(
+/*====================*/
+ /* out: TRUE if success */
+ dict_table_t* table, /* in: table on which the index is */
+ dict_index_t* index); /* in, own: index; NOTE! The index memory
+ object is freed in this function! */
+/************************************************************************
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system. */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+ /* out: number of fields */
+ dict_index_t* index); /* in: an internal representation of index
+ (in the dictionary cache) */
+/************************************************************************
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree. */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+ /* out: number of fields */
+ dict_index_t* index); /* in: an internal representation of index
+ (in the dictionary cache) */
+/************************************************************************
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account. */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ /* out: number of fields */
+ dict_index_t* index); /* in: an internal representation of index
+ (in the dictionary cache) */
+/************************************************************************
+Gets the number of user-defined ordering fields in the index. In the internal
+representation we add the row id to the ordering fields to make all indexes
+unique, but this function returns the number of fields the user defined
+in the index as ordering fields. */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ /* out: number of fields */
+ dict_index_t* index); /* in: an internal representation of index
+ (in the dictionary cache) */
+/************************************************************************
+Gets the nth field of an index. */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ /* out: pointer to field object */
+ dict_index_t* index, /* in: index */
+ ulint pos); /* in: position of field */
+/************************************************************************
+Gets pointer to the nth field data type in an index. */
+UNIV_INLINE
+dtype_t*
+dict_index_get_nth_type(
+/*====================*/
+ /* out: data type */
+ dict_index_t* index, /* in: index */
+ ulint pos); /* in: position of the field */
+/************************************************************************
+Gets the column number of the nth field in an index. */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ /* out: column number */
+ dict_index_t* index, /* in: index */
+ ulint pos); /* in: position of the field */
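+/* A minimal sketch of walking the fields of an index in the dictionary
+cache ('index' is assumed to point to a cached index object):
+
+	ulint	n_fields = dict_index_get_n_fields(index);
+	ulint	i;
+
+	for (i = 0; i < n_fields; i++) {
+		dict_field_t*	field = dict_index_get_nth_field(index, i);
+		dtype_t*	type = dict_index_get_nth_type(index, i);
+
+		... use field->name and the type ...
+	}
+*/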
+/************************************************************************
+Looks for column n in an index. */
+
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ /* out: position in internal representation
+ of the index; if not contained, returns
+ ULINT_UNDEFINED */
+ dict_index_t* index, /* in: index */
+ ulint n); /* in: column number */
+/************************************************************************
+Looks for the position of column n in the clustered index. */
+
+ulint
+dict_table_get_nth_col_pos(
+/*=======================*/
+ /* out: position in internal representation
+ of the clustered index */
+ dict_table_t* table, /* in: table */
+ ulint n); /* in: column number */
+/************************************************************************
+Returns the position of a system column in an index. */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+ /* out: position, ULINT_UNDEFINED if not
+ contained */
+ dict_index_t* index, /* in: index */
+ ulint type); /* in: DATA_ROW_ID, ... */
+/***********************************************************************
+Copies types of fields contained in index to tuple. */
+
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /* in: data tuple */
+ dict_index_t* index, /* in: index */
+ ulint n_fields); /* in: number of field types to copy */
+/************************************************************************
+Gets the value of a system column in a clustered index record. The clustered
+index must contain the system column: if the index is unique, row id is
+not contained there! */
+UNIV_INLINE
+dulint
+dict_index_rec_get_sys_col(
+/*=======================*/
+ /* out: system column value */
+ dict_index_t* index, /* in: clustered index describing the record */
+ ulint type, /* in: column type: DATA_ROLL_PTR, ... */
+ rec_t* rec); /* in: record */
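+/* A minimal sketch of reading the trx id system column from a clustered
+index record ('index' and 'rec' are assumed to be a cached clustered index
+and a user record in it):
+
+	dulint	trx_id;
+
+	trx_id = dict_index_rec_get_sys_col(index, DATA_TRX_ID, rec);
+*/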
+/*************************************************************************
+Gets the index tree where the index is stored. */
+UNIV_INLINE
+dict_tree_t*
+dict_index_get_tree(
+/*================*/
+ /* out: index tree */
+ dict_index_t* index); /* in: index */
+/*************************************************************************
+Gets the column data type. */
+UNIV_INLINE
+dtype_t*
+dict_col_get_type(
+/*==============*/
+	dict_col_t*	col);	/* in: column */
+/*************************************************************************
+Gets the field order criterion. */
+UNIV_INLINE
+ulint
+dict_field_get_order(
+/*=================*/
+	dict_field_t*	field);	/* in: field */
+/*************************************************************************
+Gets the field column. */
+UNIV_INLINE
+dict_col_t*
+dict_field_get_col(
+/*===============*/
+	dict_field_t*	field);	/* in: field */
+/**************************************************************************
+Creates an index tree struct. */
+
+dict_tree_t*
+dict_tree_create(
+/*=============*/
+ /* out, own: created tree */
+	dict_index_t*	index);	/* in: the index for which to create the tree; in the
+ case of a mixed tree, this should be the
+ index of the cluster object */
+/**************************************************************************
+Frees an index tree struct. */
+
+void
+dict_tree_free(
+/*===========*/
+ dict_tree_t* tree); /* in, own: index tree */
+/**************************************************************************
+In an index tree, finds the index corresponding to a record in the tree. */
+
+dict_index_t*
+dict_tree_find_index(
+/*=================*/
+ /* out: index */
+ dict_tree_t* tree, /* in: index tree */
+ rec_t* rec); /* in: record for which to find correct index */
+/**************************************************************************
+In an index tree, finds the index corresponding to a dtuple which is used
+in a search to a tree. */
+
+dict_index_t*
+dict_tree_find_index_for_tuple(
+/*===========================*/
+ /* out: index; NULL if the tuple does not
+ contain the mix id field in a mixed tree */
+ dict_tree_t* tree, /* in: index tree */
+ dtuple_t* tuple); /* in: tuple for which to find index */
+/***********************************************************************
+Checks if a table which is a mixed cluster member owns a record. */
+UNIV_INLINE
+ibool
+dict_is_mixed_table_rec(
+/*====================*/
+ /* out: TRUE if the record belongs to this
+ table */
+ dict_table_t* table, /* in: table in a mixed cluster */
+ rec_t* rec); /* in: user record in the clustered index */
+/**************************************************************************
+Returns an index object if it is found in the dictionary cache. */
+
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ /* out: index, NULL if not found */
+ dulint index_id); /* in: index id */
+/**************************************************************************
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer. */
+
+ibool
+dict_tree_check_search_tuple(
+/*=========================*/
+ /* out: TRUE if ok */
+ dict_tree_t* tree, /* in: index tree */
+ dtuple_t* tuple); /* in: tuple used in a search */
+/**************************************************************************
+Builds a node pointer out of a physical record and a page number. */
+
+dtuple_t*
+dict_tree_build_node_ptr(
+/*=====================*/
+ /* out, own: node pointer */
+ dict_tree_t* tree, /* in: index tree */
+ rec_t* rec, /* in: record for which to build node pointer */
+ ulint page_no,/* in: page number to put in node pointer */
+ mem_heap_t* heap); /* in: memory heap where pointer created */
+/**************************************************************************
+Copies an initial segment of a physical record, long enough to specify an
+index entry uniquely. */
+
+rec_t*
+dict_tree_copy_rec_order_prefix(
+/*============================*/
+ /* out: pointer to the prefix record */
+ dict_tree_t* tree, /* in: index tree */
+ rec_t* rec, /* in: record for which to copy prefix */
+ byte** buf, /* in/out: memory buffer for the copied prefix,
+ or NULL */
+ ulint* buf_size);/* in/out: buffer size */
+/**************************************************************************
+Builds a typed data tuple out of a physical record. */
+
+dtuple_t*
+dict_tree_build_data_tuple(
+/*=======================*/
+ /* out, own: data tuple */
+ dict_tree_t* tree, /* in: index tree */
+ rec_t* rec, /* in: record for which to build data tuple */
+ mem_heap_t* heap); /* in: memory heap where tuple created */
+/*************************************************************************
+Gets the space id of the root of the index tree. */
+UNIV_INLINE
+ulint
+dict_tree_get_space(
+/*================*/
+ /* out: space id */
+ dict_tree_t* tree); /* in: tree */
+/*************************************************************************
+Sets the space id of the root of the index tree. */
+UNIV_INLINE
+void
+dict_tree_set_space(
+/*================*/
+ dict_tree_t* tree, /* in: tree */
+ ulint space); /* in: space id */
+/*************************************************************************
+Gets the page number of the root of the index tree. */
+UNIV_INLINE
+ulint
+dict_tree_get_page(
+/*===============*/
+ /* out: page number */
+ dict_tree_t* tree); /* in: tree */
+/*************************************************************************
+Sets the page number of the root of index tree. */
+UNIV_INLINE
+void
+dict_tree_set_page(
+/*===============*/
+ dict_tree_t* tree, /* in: tree */
+ ulint page); /* in: page number */
+/*************************************************************************
+Gets the type of the index tree. */
+UNIV_INLINE
+ulint
+dict_tree_get_type(
+/*===============*/
+ /* out: type */
+ dict_tree_t* tree); /* in: tree */
+/*************************************************************************
+Gets the read-write lock of the index tree. */
+UNIV_INLINE
+rw_lock_t*
+dict_tree_get_lock(
+/*===============*/
+ /* out: read-write lock */
+ dict_tree_t* tree); /* in: tree */
+/************************************************************************
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index. */
+UNIV_INLINE
+ulint
+dict_tree_get_space_reserve(
+/*========================*/
+ /* out: number of free bytes on page,
+ reserved for updates */
+ dict_tree_t* tree); /* in: a tree */
+/*************************************************************************
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. */
+
+void
+dict_update_statistics(
+/*===================*/
+ dict_table_t* table); /* in: table */
+/************************************************************************
+Reserves the dictionary system mutex for MySQL. */
+
+void
+dict_mutex_enter_for_mysql(void);
+/*============================*/
+/************************************************************************
+Releases the dictionary system mutex for MySQL. */
+
+void
+dict_mutex_exit_for_mysql(void);
+/*===========================*/
+
+
+extern dict_sys_t* dict_sys; /* the dictionary system */
+
+/* Dictionary system struct */
+struct dict_sys_struct{
+ mutex_t mutex; /* mutex protecting the data
+ dictionary; protects also the
+ disk-based dictionary system tables;
+ this mutex serializes CREATE TABLE
+ and DROP TABLE, as well as reading
+ the dictionary data for a table from
+ system tables */
+ dulint row_id; /* the next row id to assign;
+ NOTE that at a checkpoint this
+ must be written to the dict system
+ header and flushed to a file; in
+ recovery this must be derived from
+ the log records */
+ hash_table_t* table_hash; /* hash table of the tables, based
+ on name */
+ hash_table_t* table_id_hash; /* hash table of the tables, based
+ on id */
+ hash_table_t* col_hash; /* hash table of the columns */
+ hash_table_t* procedure_hash; /* hash table of the stored
+ procedures */
+ UT_LIST_BASE_NODE_T(dict_table_t)
+ table_LRU; /* LRU list of tables */
+ ulint size; /* varying space in bytes occupied
+ by the data dictionary table and
+ index objects */
+ dict_table_t* sys_tables; /* SYS_TABLES table */
+ dict_table_t* sys_columns; /* SYS_COLUMNS table */
+ dict_table_t* sys_indexes; /* SYS_INDEXES table */
+ dict_table_t* sys_fields; /* SYS_FIELDS table */
+};
+
+#ifndef UNIV_NONINL
+#include "dict0dict.ic"
+#endif
+
+#endif
diff --git a/innobase/include/dict0dict.ic b/innobase/include/dict0dict.ic
new file mode 100644
index 00000000000..549a5763b44
--- /dev/null
+++ b/innobase/include/dict0dict.ic
@@ -0,0 +1,696 @@
+/**********************************************************************
+Data dictionary system
+
+(c) 1996 Innobase Oy
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "dict0load.h"
+#include "trx0undo.h"
+#include "trx0sys.h"
+#include "rem0rec.h"
+
+/*************************************************************************
+Gets the column data type. */
+UNIV_INLINE
+dtype_t*
+dict_col_get_type(
+/*==============*/
+ dict_col_t* col)
+{
+ ut_ad(col);
+
+ return(&col->type);
+}
+
+/*************************************************************************
+Gets the column number. */
+UNIV_INLINE
+ulint
+dict_col_get_no(
+/*============*/
+ dict_col_t* col)
+{
+ ut_ad(col);
+
+ return(col->ind);
+}
+
+/*************************************************************************
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ dict_col_t* col)
+{
+ ut_ad(col);
+
+ return(col->clust_pos);
+}
+
+/************************************************************************
+Gets the first index on the table (the clustered index). */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ /* out: index, NULL if none exists */
+ dict_table_t* table) /* in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(UT_LIST_GET_FIRST(table->indexes));
+}
+
+/************************************************************************
+Gets the next index on the table. */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ /* out: index, NULL if none left */
+ dict_index_t* index) /* in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(UT_LIST_GET_NEXT(indexes, index));
+}
+
+/************************************************************************
+Gets the number of user-defined columns in a table in the dictionary
+cache. */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+ /* out: number of user-defined (e.g., not
+ ROW_ID) columns of a table */
+ dict_table_t* table) /* in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(table->cached);
+
+ return(table->n_cols - DATA_N_SYS_COLS);
+}
+
+/************************************************************************
+Gets the number of system columns in a table in the dictionary cache. */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+ /* out: number of system (e.g.,
+ ROW_ID) columns of a table */
+ dict_table_t* table) /* in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(table->cached);
+
+ return(DATA_N_SYS_COLS);
+}
+
+/************************************************************************
+Gets the number of all columns (also system) in a table in the dictionary
+cache. */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+ /* out: number of columns of a table */
+ dict_table_t* table) /* in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(table->cached);
+
+ return(table->n_cols);
+}
+
+/************************************************************************
+Gets the nth column of a table. */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ /* out: pointer to column object */
+ dict_table_t* table, /* in: table */
+ ulint pos) /* in: position of column */
+{
+ ut_ad(table);
+ ut_ad(pos < table->n_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return((table->cols) + pos);
+}
+
+/************************************************************************
+Gets the given system column of a table. */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ /* out: pointer to column object */
+ dict_table_t* table, /* in: table */
+ ulint sys) /* in: DATA_ROW_ID, ... */
+{
+ dict_col_t* col;
+
+ ut_ad(table);
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ col = dict_table_get_nth_col(table, table->n_cols
+ - DATA_N_SYS_COLS + sys);
+ ut_ad(col->type.mtype == DATA_SYS);
+ ut_ad(col->type.prtype == sys);
+
+ return(col);
+}
+
+/************************************************************************
+Gets the given system column number of a table. */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+ /* out: column number */
+ dict_table_t* table, /* in: table */
+ ulint sys) /* in: DATA_ROW_ID, ... */
+{
+ ut_ad(table);
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_cols - DATA_N_SYS_COLS + sys);
+}
+
+/************************************************************************
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system. */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+ /* out: number of fields */
+ dict_index_t* index) /* in: an internal representation of index
+ (in the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ return(index->n_fields);
+}
+
+/************************************************************************
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree. */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+ /* out: number of fields */
+ dict_index_t* index) /* in: an internal representation of index
+ (in the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ return(index->n_uniq);
+}
+
+/************************************************************************
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account. */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ /* out: number of fields */
+ dict_index_t* index) /* in: an internal representation of index
+ (in the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (index->type & DICT_CLUSTERED) {
+
+ return(dict_index_get_n_unique(index));
+ }
+
+ return(dict_index_get_n_fields(index));
+}
+
+/************************************************************************
+Gets the number of user-defined ordering fields in the index. In the internal
+representation of clustered indexes we add the row id to the ordering fields
+to make a clustered index unique, but this function returns the number of
+fields the user defined in the index as ordering fields. */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ /* out: number of fields */
+ dict_index_t* index) /* in: an internal representation of index
+ (in the dictionary cache) */
+{
+ return(index->n_user_defined_cols);
+}
+
+/************************************************************************
+Gets the nth field of an index. */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ /* out: pointer to field object */
+ dict_index_t* index, /* in: index */
+ ulint pos) /* in: position of field */
+{
+ ut_ad(index);
+ ut_ad(pos < index->n_def);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return((index->fields) + pos);
+}
+
+/************************************************************************
+Returns the position of a system column in an index. */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+ /* out: position, ULINT_UNDEFINED if not
+ contained */
+ dict_index_t* index, /* in: index */
+ ulint type) /* in: DATA_ROW_ID, ... */
+{
+ dict_col_t* col;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(!(index->type & DICT_UNIVERSAL));
+
+ col = dict_table_get_sys_col(index->table, type);
+
+ if (index->type & DICT_CLUSTERED) {
+
+ return(col->clust_pos);
+ }
+
+ return(dict_index_get_nth_col_pos(index,
+ dict_table_get_sys_col_no(index->table, type)));
+}
+
+/************************************************************************
+Gets the value of a system column in a clustered index record. The clustered
+index must contain the system column: if the index is unique, row id is
+not contained there! */
+UNIV_INLINE
+dulint
+dict_index_rec_get_sys_col(
+/*=======================*/
+ /* out: system column value */
+ dict_index_t* index, /* in: clustered index describing the record */
+ ulint type, /* in: column type: DATA_ROLL_PTR, ... */
+ rec_t* rec) /* in: record */
+{
+ ulint pos;
+ byte* field;
+ ulint len;
+
+ ut_ad(index);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ ut_ad(pos != ULINT_UNDEFINED);
+
+ field = rec_get_nth_field(rec, pos, &len);
+
+ if (type == DATA_ROLL_PTR) {
+ ut_ad(len == 7);
+
+ return(trx_read_roll_ptr(field));
+ } else if ((type == DATA_ROW_ID) || (type == DATA_MIX_ID)) {
+
+ return(mach_dulint_read_compressed(field));
+ } else {
+ ut_ad(type == DATA_TRX_ID);
+
+ return(trx_read_trx_id(field));
+ }
+}
+
+/*************************************************************************
+Gets the index tree where the index is stored. */
+UNIV_INLINE
+dict_tree_t*
+dict_index_get_tree(
+/*================*/
+ /* out: index tree */
+ dict_index_t* index) /* in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->tree);
+}
+
+/*************************************************************************
+Gets the field order criterion. */
+UNIV_INLINE
+ulint
+dict_field_get_order(
+/*=================*/
+ dict_field_t* field)
+{
+ ut_ad(field);
+
+ return(field->order);
+}
+
+/*************************************************************************
+Gets the field column. */
+UNIV_INLINE
+dict_col_t*
+dict_field_get_col(
+/*===============*/
+ dict_field_t* field)
+{
+ ut_ad(field);
+
+ return(field->col);
+}
+
+/************************************************************************
+Gets pointer to the nth field data type in an index. */
+UNIV_INLINE
+dtype_t*
+dict_index_get_nth_type(
+/*====================*/
+ /* out: data type */
+ dict_index_t* index, /* in: index */
+ ulint pos) /* in: position of the field */
+{
+ return(dict_col_get_type(dict_field_get_col(
+ dict_index_get_nth_field(index, pos))));
+}
+
+/************************************************************************
+Gets the column number of the nth field in an index. */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ /* out: column number */
+ dict_index_t* index, /* in: index */
+ ulint pos) /* in: position of the field */
+{
+ return(dict_col_get_no(dict_field_get_col(
+ dict_index_get_nth_field(index, pos))));
+}
+
+/*************************************************************************
+Gets the space id of the root of the index tree. */
+UNIV_INLINE
+ulint
+dict_tree_get_space(
+/*================*/
+ /* out: space id */
+ dict_tree_t* tree) /* in: tree */
+{
+ ut_ad(tree);
+ ut_ad(tree->magic_n == DICT_TREE_MAGIC_N);
+
+ return(tree->space);
+}
+
+/*************************************************************************
+Sets the space id of the root of the index tree. */
+UNIV_INLINE
+void
+dict_tree_set_space(
+/*================*/
+ dict_tree_t* tree, /* in: tree */
+ ulint space) /* in: space id */
+{
+ ut_ad(tree);
+ ut_ad(tree->magic_n == DICT_TREE_MAGIC_N);
+
+ tree->space = space;
+}
+
+/*************************************************************************
+Gets the page number of the root of the index tree. */
+UNIV_INLINE
+ulint
+dict_tree_get_page(
+/*===============*/
+ /* out: page number */
+ dict_tree_t* tree) /* in: tree */
+{
+ ut_ad(tree);
+ ut_ad(tree->magic_n == DICT_TREE_MAGIC_N);
+
+ return(tree->page);
+}
+
+/*************************************************************************
+Sets the page number of the root of index tree. */
+UNIV_INLINE
+void
+dict_tree_set_page(
+/*===============*/
+ dict_tree_t* tree, /* in: tree */
+ ulint page) /* in: page number */
+{
+ ut_ad(tree);
+ ut_ad(tree->magic_n == DICT_TREE_MAGIC_N);
+
+ tree->page = page;
+}
+
+/*************************************************************************
+Gets the type of the index tree. */
+UNIV_INLINE
+ulint
+dict_tree_get_type(
+/*===============*/
+ /* out: type */
+ dict_tree_t* tree) /* in: tree */
+{
+ ut_ad(tree);
+ ut_ad(tree->magic_n == DICT_TREE_MAGIC_N);
+
+ return(tree->type);
+}
+
+/*************************************************************************
+Gets the read-write lock of the index tree. */
+UNIV_INLINE
+rw_lock_t*
+dict_tree_get_lock(
+/*===============*/
+ /* out: read-write lock */
+ dict_tree_t* tree) /* in: tree */
+{
+ ut_ad(tree);
+ ut_ad(tree->magic_n == DICT_TREE_MAGIC_N);
+
+ return(&(tree->lock));
+}
+
+/************************************************************************
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index. */
+UNIV_INLINE
+ulint
+dict_tree_get_space_reserve(
+/*========================*/
+ /* out: number of free bytes on page,
+ reserved for updates */
+ dict_tree_t* tree) /* in: a tree */
+{
+ ut_ad(tree);
+
+ UT_NOT_USED(tree);
+
+ return(UNIV_PAGE_SIZE / 16);
+}
+
+/**************************************************************************
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function. */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+ /* out: table, NULL if not found */
+ char* table_name) /* in: table name */
+{
+ dict_table_t* table;
+ ulint table_fold;
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* Look for the table name in the hash table */
+ table_fold = ut_fold_string(table_name);
+
+ HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold, table,
+ ut_strcmp(table->name, table_name) == 0);
+ if (table == NULL) {
+ table = dict_load_table(table_name);
+ }
+
+ return(table);
+}
+
+/**************************************************************************
+Returns a stored procedure object and memoryfixes it. */
+UNIV_INLINE
+dict_proc_t*
+dict_procedure_get(
+/*===============*/
+ /* out: procedure, NULL if does not exist */
+	char*	proc_name,	/* in: procedure name */
+ trx_t* trx) /* in: transaction handle or NULL */
+{
+ dict_proc_t* proc;
+ ulint name_fold;
+
+ UT_NOT_USED(trx);
+
+ mutex_enter(&(dict_sys->mutex));
+
+	/* Look for the procedure name in the hash table */
+ name_fold = ut_fold_string(proc_name);
+
+ HASH_SEARCH(name_hash, dict_sys->procedure_hash, name_fold, proc,
+ ut_strcmp(proc->name, proc_name) == 0);
+ if (proc != NULL) {
+ proc->mem_fix++;
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(proc);
+}
+
+/**************************************************************************
+Returns a table object, based on table id, and memoryfixes it. */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_on_id_low(
+/*=====================*/
+ /* out: table, NULL if does not exist */
+ dulint table_id, /* in: table id */
+ trx_t* trx) /* in: transaction handle */
+{
+ dict_table_t* table;
+ ulint fold;
+
+ UT_NOT_USED(trx);
+
+	/* Look for the table id in the hash table */
+ fold = ut_fold_dulint(table_id);
+
+ HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold, table,
+ ut_dulint_cmp(table->id, table_id) == 0);
+ if (table == NULL) {
+ table = dict_load_table_on_id(table_id);
+ }
+
+ if (table != NULL) {
+ table->mem_fix++;
+
+ /* lock_push(trx, table, LOCK_DICT_MEM_FIX) */
+ }
+
+ /* TODO: should get the type information from MySQL */
+
+ return(table);
+}
+
+/**************************************************************************
+Releases a table from being memoryfixed. Currently this has no relevance. */
+UNIV_INLINE
+void
+dict_table_release(
+/*===============*/
+ dict_table_t* table) /* in: table to be released */
+{
+ mutex_enter(&(dict_sys->mutex));
+
+ table->mem_fix--;
+
+ mutex_exit(&(dict_sys->mutex));
+}
+
+/**************************************************************************
+Returns an index object. */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_index(
+/*=================*/
+ /* out: index, NULL if does not exist */
+ dict_table_t* table, /* in: table */
+ char* name) /* in: index name */
+{
+ dict_index_t* index = NULL;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ if (ut_strcmp(name, index->name) == 0) {
+
+ break;
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(index);
+}
+
+/***********************************************************************
+Checks if a table which is a mixed cluster member owns a record. */
+UNIV_INLINE
+ibool
+dict_is_mixed_table_rec(
+/*====================*/
+ /* out: TRUE if the record belongs to this
+ table */
+ dict_table_t* table, /* in: table in a mixed cluster */
+ rec_t* rec) /* in: user record in the clustered index */
+{
+ byte* mix_id_field;
+ ulint len;
+
+ mix_id_field = rec_get_nth_field(rec, table->mix_len, &len);
+
+ if ((len != table->mix_id_len)
+ || (0 != ut_memcmp(table->mix_id_buf, mix_id_field, len))) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
diff --git a/innobase/include/dict0load.h b/innobase/include/dict0load.h
new file mode 100644
index 00000000000..d0298d8df37
--- /dev/null
+++ b/innobase/include/dict0load.h
@@ -0,0 +1,49 @@
+/******************************************************
+Loads database object definitions from dictionary tables
+to the memory cache
+
+(c) 1996 Innobase Oy
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0load_h
+#define dict0load_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "ut0byte.h"
+
+/************************************************************************
+Loads a table definition and also all its index definitions, and also
+the cluster definition, if the table is a member in a cluster. */
+
+dict_table_t*
+dict_load_table(
+/*============*/
+ /* out: table, NULL if does not exist */
+ char* name); /* in: table name */
+/***************************************************************************
+Loads a table object based on the table id. */
+
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+ /* out: table; NULL if table does not exist */
+ dulint table_id); /* in: table id */
+/************************************************************************
+This function is called when the database is booted.
+Loads system table index definitions except for the clustered index which
+is added to the dictionary cache at booting before calling this function. */
+
+void
+dict_load_sys_table(
+/*================*/
+ dict_table_t* table); /* in: system table */
+
+
+#ifndef UNIV_NONINL
+#include "dict0load.ic"
+#endif
+
+#endif
diff --git a/innobase/include/dict0load.ic b/innobase/include/dict0load.ic
new file mode 100644
index 00000000000..1a207fbf0fd
--- /dev/null
+++ b/innobase/include/dict0load.ic
@@ -0,0 +1,9 @@
+/******************************************************
+Loads database object definitions from dictionary tables
+to the memory cache
+
+(c) 1996 Innobase Oy
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
diff --git a/innobase/include/dict0mem.h b/innobase/include/dict0mem.h
new file mode 100644
index 00000000000..42b9cb55270
--- /dev/null
+++ b/innobase/include/dict0mem.h
@@ -0,0 +1,335 @@
+/******************************************************
+Data dictionary memory object creation
+
+(c) 1996 Innobase Oy
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0mem_h
+#define dict0mem_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "btr0types.h"
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "sync0rw.h"
+#include "lock0types.h"
+#include "hash0hash.h"
+#include "que0types.h"
+
+/* Type flags of an index: OR'ing of the flags is allowed to define a
+combination of types */
+#define DICT_CLUSTERED 1 /* clustered index */
+#define DICT_UNIQUE 2 /* unique index */
+#define DICT_UNIVERSAL 4 /* index which can contain records from any
+ other index */
+#define DICT_IBUF 8 /* insert buffer tree */
+
+/* Flags for ordering an index field: OR'ing of the flags allowed */
+#define DICT_DESCEND 1 /* in descending order (default ascending) */
+
+/* Types for a table object */
+#define DICT_TABLE_ORDINARY 1
+#define DICT_TABLE_CLUSTER_MEMBER 2
+#define DICT_TABLE_CLUSTER 3 /* this means that the table is
+ really a cluster definition */
+
+/**************************************************************************
+Creates a table memory object. */
+
+dict_table_t*
+dict_mem_table_create(
+/*==================*/
+ /* out, own: table object */
+ char* name, /* in: table name */
+ ulint space, /* in: space where the clustered index of
+ the table is placed; this parameter is
+ ignored if the table is made a member of
+ a cluster */
+ ulint n_cols); /* in: number of columns */
+/**************************************************************************
+Creates a cluster memory object. */
+
+dict_cluster_t*
+dict_mem_cluster_create(
+/*====================*/
+ /* out, own: cluster object (where the type
+ dict_cluster_t == dict_table_t) */
+ char* name, /* in: cluster name */
+ ulint space, /* in: space where the clustered indexes
+ of the member tables are placed */
+ ulint n_cols, /* in: number of columns */
+ ulint mix_len); /* in: length of the common key prefix in the
+ cluster */
+/**************************************************************************
+Declares a non-published table as a member in a cluster. */
+
+void
+dict_mem_table_make_cluster_member(
+/*===============================*/
+ dict_table_t* table, /* in: non-published table */
+ char* cluster_name); /* in: cluster name */
+/**************************************************************************
+Adds a column definition to a table. */
+
+void
+dict_mem_table_add_col(
+/*===================*/
+ dict_table_t* table, /* in: table */
+ char* name, /* in: column name */
+ ulint mtype, /* in: main datatype */
+ ulint prtype, /* in: precise type */
+ ulint len, /* in: length */
+ ulint prec); /* in: precision */
+/**************************************************************************
+Creates an index memory object. */
+
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+ /* out, own: index object */
+ char* table_name, /* in: table name */
+ char* index_name, /* in: index name */
+ ulint space, /* in: space where the index tree is placed,
+ ignored if the index is of the clustered
+ type */
+ ulint type, /* in: DICT_UNIQUE, DICT_CLUSTERED, ... ORed */
+ ulint n_fields); /* in: number of fields */
+/**************************************************************************
+Adds a field definition to an index. NOTE: does not take a copy
+of the column name if the field is a column. The memory occupied
+by the column name may be released only after publishing the index. */
+
+void
+dict_mem_index_add_field(
+/*=====================*/
+ dict_index_t* index, /* in: index */
+ char* name, /* in: column name */
+ ulint order); /* in: order criterion; 0 means an ascending
+ order */
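+/* A minimal sketch of building a table and its clustered index as memory
+objects with the dict_mem_... functions; the names, space id 0, and the
+column types and lengths below are only illustrative:
+
+	dict_table_t*	table;
+	dict_index_t*	index;
+
+	table = dict_mem_table_create("test/t1", 0, 2);
+
+	dict_mem_table_add_col(table, "ID", DATA_INT, 0, 4, 0);
+	dict_mem_table_add_col(table, "NAME", DATA_VARCHAR, 0, 100, 0);
+
+	index = dict_mem_index_create("test/t1", "PRIMARY", 0,
+					DICT_CLUSTERED | DICT_UNIQUE, 1);
+	dict_mem_index_add_field(index, "ID", 0);
+*/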
+/**************************************************************************
+Frees an index memory object. */
+
+void
+dict_mem_index_free(
+/*================*/
+ dict_index_t* index); /* in: index */
+/**************************************************************************
+Creates a procedure memory object. */
+
+dict_proc_t*
+dict_mem_procedure_create(
+/*======================*/
+ /* out, own: procedure object */
+ char* name, /* in: procedure name */
+ char* sql_string, /* in: procedure definition as an SQL
+ string */
+ que_fork_t* graph); /* in: parsed procedure graph */
+
+
+/* Data structure for a column in a table */
+struct dict_col_struct{
+ hash_node_t hash; /* hash chain node */
+ ulint ind; /* table column position (they are numbered
+ starting from 0) */
+ ulint clust_pos;/* position of the column in the
+ clustered index */
+	ulint		ord_part;/* count of how many times this column
+				appears in the ordering fields of an index */
+ char* name; /* name */
+ dtype_t type; /* data type */
+ dict_table_t* table; /* back pointer to table of this column */
+ ulint aux; /* this is used as an auxiliary variable
+ in some of the functions below */
+};
+
+/* Data structure for a field in an index */
+struct dict_field_struct{
+ dict_col_t* col; /* pointer to the table column */
+ char* name; /* name of the column */
+ ulint order; /* flags for ordering this field:
+ DICT_DESCEND, ... */
+};
+
+/* Data structure for an index tree */
+struct dict_tree_struct{
+ ulint type; /* tree type */
+ dulint id; /* id of the index stored in the tree, in the
+ case of a mixed index, the id of the clustered
+ index of the cluster table */
+ ulint space; /* space of index tree */
+ ulint page; /* index tree root page number */
+ byte pad[64];/* Padding to prevent other memory hotspots on
+ the same memory cache line */
+ rw_lock_t lock; /* read-write lock protecting the upper levels
+ of the index tree */
+ ulint mem_fix;/* count of how many times this tree
+ struct has been memoryfixed (by mini-
+ transactions wanting to access the index
+ tree) */
+ UT_LIST_BASE_NODE_T(dict_index_t)
+ tree_indexes; /* list of indexes stored in the
+ index tree: if the tree is not of the
+ mixed type there is only one index in
+ the list; if the tree is of the mixed
+ type, the first index in the list is the
+ index of the cluster which owns the tree */
+ ulint magic_n;/* magic number */
+};
+
+#define DICT_TREE_MAGIC_N 7545676
+
+/* Data structure for an index */
+struct dict_index_struct{
+ dulint id; /* id of the index */
+ mem_heap_t* heap; /* memory heap */
+ ulint type; /* index type */
+ char* name; /* index name */
+ char* table_name; /* table name */
+ dict_table_t* table; /* back pointer to table */
+ ulint space; /* space where the index tree is placed */
+ ulint page_no;/* page number of the index tree root */
+	ulint		trx_id_offset;/* position of the trx id column
+ in a clustered index record, if the fields
+ before it are known to be of a fixed size,
+ 0 otherwise */
+ ulint n_user_defined_cols;
+ /* number of columns the user defined to
+ be in the index: in the internal
+ representation we add more columns */
+ ulint n_uniq; /* number of fields from the beginning
+ which are enough to determine an index
+ entry uniquely */
+ ulint n_def; /* number of fields defined so far */
+ ulint n_fields;/* number of fields in the index */
+ dict_field_t* fields; /* array of field descriptions */
+ UT_LIST_NODE_T(dict_index_t)
+ indexes;/* list of indexes of the table */
+ dict_tree_t* tree; /* index tree struct */
+ UT_LIST_NODE_T(dict_index_t)
+ tree_indexes; /* list of indexes of the same index
+ tree */
+ ibool cached; /* TRUE if the index object is in the
+ dictionary cache */
+ btr_search_t* search_info; /* info used in optimistic searches */
+ /*----------------------*/
+ ulint stat_n_diff_key_vals;
+ /* approximate number of different key values
+ for this index; we periodically calculate
+ new estimates */
+ ulint stat_index_size;
+ /* approximate index size in database pages */
+ ulint magic_n;/* magic number */
+};
+
+#define DICT_INDEX_MAGIC_N 76789786
+
+/* Data structure for a database table */
+struct dict_table_struct{
+ dulint id; /* id of the table or cluster */
+ ulint type; /* DICT_TABLE_ORDINARY, ... */
+ mem_heap_t* heap; /* memory heap */
+ char* name; /* table name */
+ ulint space; /* space where the clustered index of the
+ table is placed */
+ hash_node_t name_hash; /* hash chain node */
+ hash_node_t id_hash; /* hash chain node */
+ ulint n_def; /* number of columns defined so far */
+ ulint n_cols; /* number of columns */
+ dict_col_t* cols; /* array of column descriptions */
+ UT_LIST_BASE_NODE_T(dict_index_t)
+ indexes; /* list of indexes of the table */
+ UT_LIST_NODE_T(dict_table_t)
+ table_LRU; /* node of the LRU list of tables */
+ ulint mem_fix;/* count of how many times the table
+				and its indexes have been fixed in memory;
+ currently NOT used */
+ ibool cached; /* TRUE if the table object has been added
+ to the dictionary cache */
+ UT_LIST_BASE_NODE_T(lock_t)
+ locks; /* list of locks on the table */
+ /*----------------------*/
+ dulint mix_id; /* if the table is a member in a cluster,
+ this is its mix id */
+ ulint mix_len;/* if the table is a cluster or a member
+				this is the common key prefix length */
+ ulint mix_id_len;/* mix id length in a compressed form */
+ byte mix_id_buf[12];
+ /* mix id of a mixed table written in
+ a compressed form */
+ char* cluster_name; /* if the table is a member in a
+ cluster, this is the name of the cluster */
+ /*----------------------*/
+ ibool does_not_fit_in_memory;
+				/* this field is used in simulations to mark
+				tables which are so big that disk should be
+ accessed: disk access is simulated by
+ putting the thread to sleep for a while;
+ NOTE that this flag is not stored to the data
+ dictionary on disk, and the database will
+ forget about value TRUE if it has to reload
+ the table definition from disk */
+ /*----------------------*/
+ ulint stat_n_rows;
+ /* approximate number of rows in the table;
+ we periodically calculate new estimates */
+ ulint stat_clustered_index_size;
+ /* approximate clustered index size in
+ database pages */
+ ulint stat_sum_of_other_index_sizes;
+				/* approximate sum of the sizes of the other
+				indexes, in database pages */
+ ulint stat_last_estimate_counter;
+				/* when the estimates were last
+ calculated; a value (ulint)-1 denotes that
+ they have not yet been calculated for this
+ table (or the counter has wrapped over) */
+ ulint stat_modif_counter;
+ /* when a row is inserted, updated, or deleted,
+ we add the row length to this number; we
+ calculate new estimates for the stat_...
+ values for the table and the indexes at an
+ interval of DICT_STAT_CALCULATE_INTERVAL,
+ but for small tables more often, also
+ when the estimate operation is called
+ for MySQL SHOW TABLE STATUS; this counter
+ is not protected by any latch, because this
+ is only used for heuristics */
+ ulint magic_n;/* magic number */
+};
+#define DICT_TABLE_MAGIC_N 76333786
+
+/* Statistics are calculated at least with this interval; see the struct
+above */
+#define DICT_STAT_CALCULATE_INTERVAL (UNIV_PAGE_SIZE * 8)
+
+/* Data structure for a stored procedure */
+struct dict_proc_struct{
+ mem_heap_t* heap; /* memory heap */
+ char* name; /* procedure name */
+ char* sql_string;
+ /* procedure definition as an SQL string:
+ we can produce more parsed instances of the
+ procedure by parsing this string */
+ hash_node_t name_hash;
+ /* hash chain node */
+ UT_LIST_BASE_NODE_T(que_fork_t) graphs;
+ /* list of parsed instances of the procedure:
+ there may be many of them, and they are
+ recycled */
+ ulint mem_fix;/* count of how many times this struct
+ has been fixed in memory */
+};
+
+#ifndef UNIV_NONINL
+#include "dict0mem.ic"
+#endif
+
+#endif
diff --git a/innobase/include/dict0mem.ic b/innobase/include/dict0mem.ic
new file mode 100644
index 00000000000..9bcefc2a51f
--- /dev/null
+++ b/innobase/include/dict0mem.ic
@@ -0,0 +1,9 @@
+/**********************************************************************
+Data dictionary memory object creation
+
+(c) 1996 Innobase Oy
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+
diff --git a/innobase/include/dict0types.h b/innobase/include/dict0types.h
new file mode 100644
index 00000000000..fe1bad45063
--- /dev/null
+++ b/innobase/include/dict0types.h
@@ -0,0 +1,28 @@
+/******************************************************
+Data dictionary global types
+
+(c) 1996 Innobase Oy
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0types_h
+#define dict0types_h
+
+typedef struct dict_sys_struct dict_sys_t;
+typedef struct dict_col_struct dict_col_t;
+typedef struct dict_field_struct dict_field_t;
+typedef struct dict_index_struct dict_index_t;
+typedef struct dict_tree_struct dict_tree_t;
+typedef struct dict_table_struct dict_table_t;
+typedef struct dict_proc_struct dict_proc_t;
+
+/* A cluster object is a table object with the type field set to
+DICT_TABLE_CLUSTER */
+
+typedef dict_table_t dict_cluster_t;
+
+typedef struct ind_node_struct ind_node_t;
+typedef struct tab_node_struct tab_node_t;
+
+#endif
diff --git a/innobase/include/dyn0dyn.h b/innobase/include/dyn0dyn.h
new file mode 100644
index 00000000000..07ad8539b38
--- /dev/null
+++ b/innobase/include/dyn0dyn.h
@@ -0,0 +1,172 @@
+/******************************************************
+The dynamically allocated array
+
+(c) 1996 Innobase Oy
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dyn0dyn_h
+#define dyn0dyn_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "mem0mem.h"
+
+typedef struct dyn_block_struct dyn_block_t;
+typedef dyn_block_t dyn_array_t;
+
+
+/* Initial 'payload' size in bytes in a dynamic array block */
+#define DYN_ARRAY_DATA_SIZE 1024
+
+/*************************************************************************
+Initializes a dynamic array. */
+UNIV_INLINE
+dyn_array_t*
+dyn_array_create(
+/*=============*/
+ /* out: initialized dyn array */
+ dyn_array_t* arr); /* in: pointer to a memory buffer of
+ size sizeof(dyn_array_t) */
+/****************************************************************
+Frees a dynamic array. */
+UNIV_INLINE
+void
+dyn_array_free(
+/*===========*/
+ dyn_array_t* arr); /* in: dyn array */
+/*************************************************************************
+Makes room on top of a dyn array and returns a pointer to a buffer in it.
+After copying the elements, the caller must close the buffer using
+dyn_array_close. */
+UNIV_INLINE
+byte*
+dyn_array_open(
+/*===========*/
+ /* out: pointer to the buffer */
+ dyn_array_t* arr, /* in: dynamic array */
+ ulint size); /* in: size in bytes of the buffer */
+/*************************************************************************
+Closes the buffer returned by dyn_array_open. */
+UNIV_INLINE
+void
+dyn_array_close(
+/*============*/
+ dyn_array_t* arr, /* in: dynamic array */
+ byte* ptr); /* in: buffer space from ptr up was not used */
+/*************************************************************************
+Makes room on top of a dyn array and returns a pointer to
+the added element. The caller must copy the element to
+the pointer returned. */
+UNIV_INLINE
+void*
+dyn_array_push(
+/*===========*/
+ /* out: pointer to the element */
+ dyn_array_t* arr, /* in: dynamic array */
+ ulint size); /* in: size in bytes of the element */
+/****************************************************************
+Returns pointer to an element in dyn array. */
+UNIV_INLINE
+void*
+dyn_array_get_element(
+/*==================*/
+ /* out: pointer to element */
+ dyn_array_t* arr, /* in: dyn array */
+ ulint pos); /* in: position of element as bytes
+ from array start */
+/****************************************************************
+Returns the size of stored data in a dyn array. */
+UNIV_INLINE
+ulint
+dyn_array_get_data_size(
+/*====================*/
+ /* out: data size in bytes */
+ dyn_array_t* arr); /* in: dyn array */
+/****************************************************************
+Gets the first block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_first_block(
+/*======================*/
+ dyn_array_t* arr); /* in: dyn array */
+/****************************************************************
+Gets the last block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_last_block(
+/*=====================*/
+ dyn_array_t* arr); /* in: dyn array */
+/************************************************************************
+Gets the next block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_next_block(
+/*=====================*/
+ /* out: pointer to next, NULL if end of list */
+ dyn_array_t* arr, /* in: dyn array */
+ dyn_block_t* block); /* in: dyn array block */
+/************************************************************************
+Gets the number of used bytes in a dyn array block. */
+UNIV_INLINE
+ulint
+dyn_block_get_used(
+/*===============*/
+ /* out: number of bytes used */
+ dyn_block_t* block); /* in: dyn array block */
+/************************************************************************
+Gets pointer to the start of data in a dyn array block. */
+UNIV_INLINE
+byte*
+dyn_block_get_data(
+/*===============*/
+ /* out: pointer to data */
+ dyn_block_t* block); /* in: dyn array block */
+/************************************************************************
+Gets the next block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_block_get_next(
+/*===============*/
+ /* out: pointer to next, NULL if end of list */
+ dyn_block_t* block); /* in: dyn array block */
+/************************************************************
+Pushes n bytes to a dyn array. */
+UNIV_INLINE
+void
+dyn_push_string(
+/*============*/
+ dyn_array_t* arr, /* in: dyn array */
+ byte* str, /* in: string to write */
+ ulint len); /* in: string length */
+
+/*#################################################################*/
+
+/* NOTE! Do not use the fields of the struct directly: the definition
+appears here only for the compiler to know its size! */
+struct dyn_block_struct{
+ mem_heap_t* heap; /* in the first block this is != NULL
+ if dynamic allocation has been needed */
+ ulint used; /* number of data bytes used in this block */
+ byte data[DYN_ARRAY_DATA_SIZE];
+ /* storage for array elements */
+ UT_LIST_BASE_NODE_T(dyn_block_t) base;
+ /* linear list of dyn blocks: this node is
+ used only in the first block */
+ UT_LIST_NODE_T(dyn_block_t) list;
+ /* linear list node: used in all blocks */
+#ifdef UNIV_DEBUG
+ ulint buf_end;/* only in the debug version: if dyn array is
+ opened, this is the buffer end offset, else
+ this is 0 */
+ ulint magic_n;
+#endif
+};
+
+
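+/* An illustrative usage sketch (the function name is hypothetical): the
+first block of the array lives in a caller-provided buffer, and further
+blocks are allocated from a memory heap only when the data outgrows it. */
+UNIV_INLINE
+void
+dyn_array_usage_example(void)
+/*=========================*/
+{
+	dyn_array_t	arr;
+	ulint*		elem;
+
+	dyn_array_create(&arr);
+
+	/* Reserve room for one ulint and fill it in through the returned
+	pointer */
+	elem = (ulint*) dyn_array_push(&arr, sizeof(ulint));
+	*elem = 42;
+
+	/* Append a byte string; it may get split over several blocks */
+	dyn_push_string(&arr, (byte*)"abc", 3);
+
+	ut_a(dyn_array_get_data_size(&arr) == sizeof(ulint) + 3);
+
+	/* Elements are addressed by their byte offset from the array start */
+	ut_a(*((ulint*) dyn_array_get_element(&arr, 0)) == 42);
+
+	dyn_array_free(&arr);	/* frees the heap blocks, if any */
+}
+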
+#ifndef UNIV_NONINL
+#include "dyn0dyn.ic"
+#endif
+
+#endif
diff --git a/innobase/include/dyn0dyn.ic b/innobase/include/dyn0dyn.ic
new file mode 100644
index 00000000000..dc004efbb8b
--- /dev/null
+++ b/innobase/include/dyn0dyn.ic
@@ -0,0 +1,345 @@
+/******************************************************
+The dynamically allocated array
+
+(c) 1996 Innobase Oy
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+#define DYN_BLOCK_MAGIC_N 375767
+#define DYN_BLOCK_FULL_FLAG 0x1000000
+
+/****************************************************************
+Adds a new block to a dyn array. */
+
+dyn_block_t*
+dyn_array_add_block(
+/*================*/
+ /* out: created block */
+ dyn_array_t* arr); /* in: dyn array */
+
+
+/****************************************************************
+Gets the first block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_first_block(
+/*======================*/
+ dyn_array_t* arr) /* in: dyn array */
+{
+ return(arr);
+}
+
+/****************************************************************
+Gets the last block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_last_block(
+/*=====================*/
+ dyn_array_t* arr) /* in: dyn array */
+{
+ if (arr->heap == NULL) {
+
+ return(arr);
+ }
+
+ return(UT_LIST_GET_LAST(arr->base));
+}
+
+/************************************************************************
+Gets the next block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_next_block(
+/*=====================*/
+ /* out: pointer to next, NULL if end of list */
+ dyn_array_t* arr, /* in: dyn array */
+ dyn_block_t* block) /* in: dyn array block */
+{
+ ut_ad(arr && block);
+
+ if (arr->heap == NULL) {
+ ut_ad(arr == block);
+
+ return(NULL);
+ }
+
+ return(UT_LIST_GET_NEXT(list, block));
+}
+
+/************************************************************************
+Gets the number of used bytes in a dyn array block. */
+UNIV_INLINE
+ulint
+dyn_block_get_used(
+/*===============*/
+ /* out: number of bytes used */
+ dyn_block_t* block) /* in: dyn array block */
+{
+ ut_ad(block);
+
+ return((block->used) & ~DYN_BLOCK_FULL_FLAG);
+}
+
+/************************************************************************
+Gets pointer to the start of data in a dyn array block. */
+UNIV_INLINE
+byte*
+dyn_block_get_data(
+/*===============*/
+ /* out: pointer to data */
+ dyn_block_t* block) /* in: dyn array block */
+{
+ ut_ad(block);
+
+ return(block->data);
+}
+
+/*************************************************************************
+Initializes a dynamic array. */
+UNIV_INLINE
+dyn_array_t*
+dyn_array_create(
+/*=============*/
+ /* out: initialized dyn array */
+ dyn_array_t* arr) /* in: pointer to a memory buffer of
+ size sizeof(dyn_array_t) */
+{
+ ut_ad(arr);
+ ut_ad(DYN_ARRAY_DATA_SIZE < DYN_BLOCK_FULL_FLAG);
+
+ arr->heap = NULL;
+ arr->used = 0;
+
+#ifdef UNIV_DEBUG
+ arr->buf_end = 0;
+ arr->magic_n = DYN_BLOCK_MAGIC_N;
+#endif
+ return(arr);
+}
+
+/****************************************************************
+Frees a dynamic array. */
+UNIV_INLINE
+void
+dyn_array_free(
+/*===========*/
+ dyn_array_t* arr) /* in: dyn array */
+{
+ if (arr->heap != NULL) {
+ mem_heap_free(arr->heap);
+ }
+
+#ifdef UNIV_DEBUG
+ arr->magic_n = 0;
+#endif
+}
+
+/*************************************************************************
+Makes room on top of a dyn array and returns a pointer to the added element.
+The caller must copy the element to the pointer returned. */
+UNIV_INLINE
+void*
+dyn_array_push(
+/*===========*/
+ /* out: pointer to the element */
+ dyn_array_t* arr, /* in: dynamic array */
+ ulint size) /* in: size in bytes of the element */
+{
+ dyn_block_t* block;
+ ulint used;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+ ut_ad(size <= DYN_ARRAY_DATA_SIZE);
+ ut_ad(size);
+
+ block = arr;
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ /* Get the last array block */
+
+ block = dyn_array_get_last_block(arr);
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ block = dyn_array_add_block(arr);
+ used = block->used;
+ }
+ }
+
+ block->used = used + size;
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+
+ return((block->data) + used);
+}
+
+/*************************************************************************
+Makes room on top of a dyn array and returns a pointer to a buffer in it.
+After copying the elements, the caller must close the buffer using
+dyn_array_close. */
+UNIV_INLINE
+byte*
+dyn_array_open(
+/*===========*/
+ /* out: pointer to the buffer */
+ dyn_array_t* arr, /* in: dynamic array */
+ ulint size) /* in: size in bytes of the buffer */
+{
+ dyn_block_t* block;
+ ulint used;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+ ut_ad(size <= DYN_ARRAY_DATA_SIZE);
+ ut_ad(size);
+
+ block = arr;
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ /* Get the last array block */
+
+ block = dyn_array_get_last_block(arr);
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ block = dyn_array_add_block(arr);
+ used = block->used;
+ }
+ }
+
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+#ifdef UNIV_DEBUG
+ ut_ad(arr->buf_end == 0);
+
+ arr->buf_end = used + size;
+#endif
+ return((block->data) + used);
+}
+
+/*************************************************************************
+Closes the buffer returned by dyn_array_open. */
+UNIV_INLINE
+void
+dyn_array_close(
+/*============*/
+ dyn_array_t* arr, /* in: dynamic array */
+ byte* ptr) /* in: buffer space from ptr up was not used */
+{
+ dyn_block_t* block;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ block = dyn_array_get_last_block(arr);
+
+ ut_ad(arr->buf_end + block->data >= ptr);
+
+ block->used = ptr - block->data;
+
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+
+#ifdef UNIV_DEBUG
+ arr->buf_end = 0;
+#endif
+}
+
+/****************************************************************
+Returns pointer to an element in dyn array. */
+UNIV_INLINE
+void*
+dyn_array_get_element(
+/*==================*/
+ /* out: pointer to element */
+ dyn_array_t* arr, /* in: dyn array */
+ ulint pos) /* in: position of element as bytes
+ from array start */
+{
+ dyn_block_t* block;
+ ulint used;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ /* Get the first array block */
+ block = dyn_array_get_first_block(arr);
+
+ if (arr->heap != NULL) {
+ used = dyn_block_get_used(block);
+
+ while (pos >= used) {
+ pos -= used;
+ block = UT_LIST_GET_NEXT(list, block);
+ ut_ad(block);
+
+ used = dyn_block_get_used(block);
+ }
+ }
+
+ ut_ad(block);
+ ut_ad(dyn_block_get_used(block) >= pos);
+
+ return(block->data + pos);
+}
+
+/****************************************************************
+Returns the size of stored data in a dyn array. */
+UNIV_INLINE
+ulint
+dyn_array_get_data_size(
+/*====================*/
+ /* out: data size in bytes */
+ dyn_array_t* arr) /* in: dyn array */
+{
+ dyn_block_t* block;
+ ulint sum = 0;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ if (arr->heap == NULL) {
+
+ return(arr->used);
+ }
+
+ /* Get the first array block */
+ block = dyn_array_get_first_block(arr);
+
+ while (block != NULL) {
+ sum += dyn_block_get_used(block);
+ block = dyn_array_get_next_block(arr, block);
+ }
+
+ return(sum);
+}
+
+/************************************************************
+Pushes n bytes to a dyn array. */
+UNIV_INLINE
+void
+dyn_push_string(
+/*============*/
+ dyn_array_t* arr, /* in: dyn array */
+ byte* str, /* in: string to write */
+ ulint len) /* in: string length */
+{
+ byte* ptr;
+ ulint n_copied;
+
+ while (len > 0) {
+ if (len > DYN_ARRAY_DATA_SIZE) {
+ n_copied = DYN_ARRAY_DATA_SIZE;
+ } else {
+ n_copied = len;
+ }
+
+ ptr = (byte*) dyn_array_push(arr, n_copied);
+
+ ut_memcpy(ptr, str, n_copied);
+
+ str += n_copied;
+ len -= n_copied;
+ }
+}
diff --git a/innobase/include/eval0eval.h b/innobase/include/eval0eval.h
new file mode 100644
index 00000000000..6561f0c8ae7
--- /dev/null
+++ b/innobase/include/eval0eval.h
@@ -0,0 +1,97 @@
+/******************************************************
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+(c) 1997 Innobase Oy
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0eval_h
+#define eval0eval_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/*********************************************************************
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in eval_node_alloc_val_buf. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+
+void
+eval_node_free_val_buf(
+/*===================*/
+ que_node_t* node); /* in: query graph node */
+/*********************************************************************
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node); /* in: symbol table node */
+/*********************************************************************
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node); /* in: expression */
+/*********************************************************************
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /* in: expression node */
+ lint val); /* in: value to set */
+/*********************************************************************
+Gets an integer value from an expression node. */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ /* out: integer value */
+ que_node_t* node); /* in: expression node */
+/*********************************************************************
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /* in: query graph node */
+ byte* str, /* in: binary string */
+ ulint len); /* in: string length or UNIV_SQL_NULL */
+/*********************************************************************
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /* in: node to copy to */
+ que_node_t* node2); /* in: node to copy from */
+/*********************************************************************
+Gets an iboolean value from a query node. */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*===================*/
+ /* out: iboolean value */
+ que_node_t* node); /* in: query graph node */
+/*********************************************************************
+Evaluates a comparison node. */
+
+ibool
+eval_cmp(
+/*=====*/
+ /* out: the result of the comparison */
+ func_node_t* cmp_node); /* in: comparison node */
+
+
+#ifndef UNIV_NONINL
+#include "eval0eval.ic"
+#endif
+
+#endif
diff --git a/innobase/include/eval0eval.ic b/innobase/include/eval0eval.ic
new file mode 100644
index 00000000000..2530c869206
--- /dev/null
+++ b/innobase/include/eval0eval.ic
@@ -0,0 +1,236 @@
+/******************************************************
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+(c) 1997 Innobase Oy
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "pars0grm.h"
+
+/*********************************************************************
+Evaluates a function node. */
+
+void
+eval_func(
+/*======*/
+ func_node_t* func_node); /* in: function node */
+/*********************************************************************
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field. */
+
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+ /* out: pointer to allocated buffer */
+ que_node_t* node, /* in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size); /* in: buffer size */
+
+
+/*********************************************************************
+Allocates a new buffer if needed. */
+UNIV_INLINE
+byte*
+eval_node_ensure_val_buf(
+/*=====================*/
+ /* out: pointer to buffer */
+ que_node_t* node, /* in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size) /* in: buffer size */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+ dfield_set_len(dfield, size);
+
+ data = dfield_get_data(dfield);
+
+ if (!data || que_node_get_val_buf_size(node) < size) {
+
+ data = eval_node_alloc_val_buf(node, size);
+ }
+
+ return(data);
+}
+
+/*********************************************************************
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node) /* in: symbol table node */
+{
+
+ ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+ if (sym_node->indirection) {
+ /* The symbol table node is an alias for a variable or a
+ column */
+
+ dfield_copy_data(que_node_get_val(sym_node),
+ que_node_get_val(sym_node->indirection));
+ }
+}
+
+/*********************************************************************
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node) /* in: expression */
+{
+ if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) {
+
+ eval_sym((sym_node_t*)exp_node);
+
+ return;
+ }
+
+ eval_func(exp_node);
+}
+
+/*********************************************************************
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /* in: expression node */
+ lint val) /* in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = dfield_get_data(dfield);
+
+ if (data == NULL) {
+ data = eval_node_alloc_val_buf(node, 4);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ mach_write_to_4(data, (ulint)val);
+}
+
+/*********************************************************************
+Gets an integer value from an expression node. The value must not be an
+SQL null value. */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ /* out: integer value */
+ que_node_t* node) /* in: expression node */
+{
+ dfield_t* dfield;
+
+ dfield = que_node_get_val(node);
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ return((int)mach_read_from_4(dfield_get_data(dfield)));
+}
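+
+/* An illustrative sketch of the 4-byte round trip that the two functions
+above rely on (the function name is hypothetical; mach_write_to_4 and
+mach_read_from_4 are the same helpers used above): */
+UNIV_INLINE
+void
+eval_int_val_roundtrip_example(void)
+/*================================*/
+{
+	byte	buf[4];
+	lint	val = -5;
+
+	/* eval_node_set_int_val stores the value as 4 bytes */
+	mach_write_to_4(buf, (ulint)val);
+
+	/* eval_node_get_int_val reads the bytes back and casts to a signed
+	integer, so small negative values survive the round trip on the
+	usual two's-complement platforms */
+	ut_a((lint)(int)mach_read_from_4(buf) == val);
+}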
+
+/*********************************************************************
+Gets an iboolean value from a query node. */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*===================*/
+ /* out: iboolean value */
+ que_node_t* node) /* in: query graph node */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = dfield_get_data(dfield);
+
+ ut_ad(data != NULL);
+
+ return(mach_read_from_1(data));
+}
+
+/*********************************************************************
+Sets an iboolean value as the value of a function node. */
+UNIV_INLINE
+void
+eval_node_set_ibool_val(
+/*===================*/
+ func_node_t* func_node, /* in: function node */
+ ibool val) /* in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(func_node);
+
+ data = dfield_get_data(dfield);
+
+ if (data == NULL) {
+ /* Allocate 1 byte to hold the value */
+
+ data = eval_node_alloc_val_buf(func_node, 1);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 1);
+
+ mach_write_to_1(data, val);
+}
+
+/*********************************************************************
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /* in: query graph node */
+ byte* str, /* in: binary string */
+ ulint len) /* in: string length or UNIV_SQL_NULL */
+{
+ byte* data;
+
+ ut_ad(UNIV_SQL_NULL > ULINT_MAX);
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_len(que_node_get_val(node), len);
+
+ return;
+ }
+
+ data = eval_node_ensure_val_buf(node, len);
+
+ ut_memcpy(data, str, len);
+}
+
+/*********************************************************************
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /* in: node to copy to */
+ que_node_t* node2) /* in: node to copy from */
+{
+ dfield_t* dfield2;
+
+ dfield2 = que_node_get_val(node2);
+
+ eval_node_copy_and_alloc_val(node1, dfield_get_data(dfield2),
+ dfield_get_len(dfield2));
+}
diff --git a/innobase/include/eval0proc.h b/innobase/include/eval0proc.h
new file mode 100644
index 00000000000..5d685ad9076
--- /dev/null
+++ b/innobase/include/eval0proc.h
@@ -0,0 +1,79 @@
+/******************************************************
+Executes SQL stored procedures and their control structures
+
+(c) 1998 Innobase Oy
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0proc_h
+#define eval0proc_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/**************************************************************************
+Performs an execution step of a procedure node. */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Performs an execution step of an if-statement node. */
+
+que_thr_t*
+if_step(
+/*====*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Performs an execution step of a while-statement node. */
+
+que_thr_t*
+while_step(
+/*=======*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Performs an execution step of a for-loop node. */
+
+que_thr_t*
+for_step(
+/*=====*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Performs an execution step of an assignment statement node. */
+
+que_thr_t*
+assign_step(
+/*========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Performs an execution step of a procedure call node. */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Performs an execution step of a return-statement node. */
+
+que_thr_t*
+return_step(
+/*========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+
+
+#ifndef UNIV_NONINL
+#include "eval0proc.ic"
+#endif
+
+#endif
diff --git a/innobase/include/eval0proc.ic b/innobase/include/eval0proc.ic
new file mode 100644
index 00000000000..0d7ecb6d1dc
--- /dev/null
+++ b/innobase/include/eval0proc.ic
@@ -0,0 +1,71 @@
+/******************************************************
+Executes SQL stored procedures and their control structures
+
+(c) 1998 Innobase Oy
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+
+/**************************************************************************
+Performs an execution step of a procedure node. */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ proc_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_PROC);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ /* Start execution from the first statement in the statement
+ list */
+
+ thr->run_node = node->stat_list;
+ } else {
+ /* Move to the next statement */
+ ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+ thr->run_node = NULL;
+ }
+
+ if (thr->run_node == NULL) {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**************************************************************************
+Performs an execution step of a procedure call node. */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ func_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ /* Evaluate the procedure */
+
+ eval_exp(node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h
new file mode 100644
index 00000000000..9905b5a2c3c
--- /dev/null
+++ b/innobase/include/fil0fil.h
@@ -0,0 +1,357 @@
+/******************************************************
+The low-level file system
+
+(c) 1995 Innobase Oy
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fil0fil_h
+#define fil0fil_h
+
+#include "univ.i"
+#include "sync0rw.h"
+#include "dict0types.h"
+#include "ibuf0types.h"
+#include "ut0byte.h"
+#include "os0file.h"
+
+/* 'null' (undefined) page offset in the context of file spaces */
+#define FIL_NULL ULINT32_UNDEFINED
+
+/* Space address data type; this is intended to be used when
+addresses accurate to a byte are stored in file pages. If the page part
+of the address is FIL_NULL, the address is considered undefined. */
+
+typedef byte fil_faddr_t; /* 'type' definition in C: an address
+ stored in a file page is a string of bytes */
+#define FIL_ADDR_PAGE	0	/* first in the address is the page offset */
+#define	FIL_ADDR_BYTE	4	/* then comes the 2-byte byte offset within
+				the page */
+
+#define FIL_ADDR_SIZE 6 /* address size is 6 bytes */
+
+/* A struct for storing a space address FIL_ADDR, when it is used
+in C program data structures. */
+
+typedef struct fil_addr_struct fil_addr_t;
+struct fil_addr_struct{
+ ulint page; /* page number within a space */
+ ulint boffset; /* byte offset within the page */
+};
+
+/* Null file address */
+extern fil_addr_t fil_addr_null;
+
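+/* An illustrative sketch (the function name is hypothetical; mach_write_to_4
+and mach_write_to_2 are assumed from mach0data.h) of how a fil_addr_t is
+stored into the FIL_ADDR_SIZE bytes of a file page, using the offsets
+defined above: */
+UNIV_INLINE
+void
+fil_addr_write_example(
+/*===================*/
+	fil_faddr_t*	faddr,	/* in: where to write, in a file page */
+	fil_addr_t	addr)	/* in: address to store */
+{
+	mach_write_to_4(faddr + FIL_ADDR_PAGE, addr.page);
+	mach_write_to_2(faddr + FIL_ADDR_BYTE, addr.boffset);
+}
+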
+/* The byte offsets on a file page for various variables */
+#define FIL_PAGE_SPACE 0 /* space id the page belongs to */
+#define FIL_PAGE_OFFSET 4 /* page offset inside space */
+#define FIL_PAGE_PREV 8 /* if there is a 'natural' predecessor
+ of the page, its offset */
+#define FIL_PAGE_NEXT 12 /* if there is a 'natural' successor
+ of the page, its offset */
+#define FIL_PAGE_LSN 16 /* lsn of the end of the newest
+ modification log record to the page */
+#define FIL_PAGE_TYPE 24 /* file page type: FIL_PAGE_INDEX,...,
+ 2 bytes */
+#define FIL_PAGE_FILE_FLUSH_LSN 26 /* this is only defined for the
+ first page in a data file: the file
+ has been flushed to disk at least up
+ to this lsn */
+#define FIL_PAGE_ARCH_LOG_NO 34 /* this is only defined for the
+ first page in a data file: the latest
+ archived log file number when the
+ flush lsn above was written */
+#define FIL_PAGE_DATA 38 /* start of the data on the page */
+
+/* File page trailer */
+#define FIL_PAGE_END_LSN 8 /* this should be same as
+ FIL_PAGE_LSN */
+#define FIL_PAGE_DATA_END 8
+
+/* File page types */
+#define FIL_PAGE_INDEX 17855
+#define FIL_PAGE_UNDO_LOG 2
+
+/* Space types */
+#define FIL_TABLESPACE 501
+#define FIL_LOG 502
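+
+/* An illustrative sketch (the function name is hypothetical; mach_read_from_4
+is assumed from mach0data.h) of how the header offsets above are used: the
+other 4-byte fields are read the same way, and the 2-byte FIL_PAGE_TYPE with
+mach_read_from_2. */
+UNIV_INLINE
+ulint
+fil_page_example_get_space_id(
+/*==========================*/
+			/* out: space id stored in the page header */
+	byte*	page)	/* in: file page frame */
+{
+	return(mach_read_from_4(page + FIL_PAGE_SPACE));
+}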
+
+/***********************************************************************
+Reserves a right to open a single file. The right must be released with
+fil_release_right_to_open. */
+
+void
+fil_reserve_right_to_open(void);
+/*===========================*/
+/***********************************************************************
+Releases a right to open a single file. */
+
+void
+fil_release_right_to_open(void);
+/*===========================*/
+/************************************************************************
+Returns TRUE if file address is undefined. */
+ibool
+fil_addr_is_null(
+/*=============*/
+ /* out: TRUE if undefined */
+ fil_addr_t addr); /* in: address */
+/********************************************************************
+Initializes the file system of this module. */
+
+void
+fil_init(
+/*=====*/
+ ulint max_n_open); /* in: max number of open files */
+/********************************************************************
+Initializes the ibuf indexes at a database start. This can be called
+after the file space headers have been created and the dictionary system
+has been initialized. */
+
+void
+fil_ibuf_init_at_db_start(void);
+/*===========================*/
+/***********************************************************************
+Creates a space object and puts it to the file system. */
+
+void
+fil_space_create(
+/*=============*/
+ char* name, /* in: space name */
+ ulint id, /* in: space id */
+ ulint purpose);/* in: FIL_TABLESPACE, or FIL_LOG if log */
+/********************************************************************
+Drops files from the start of a file space, so that its size is cut by
+the amount given. */
+
+void
+fil_space_truncate_start(
+/*=====================*/
+ ulint id, /* in: space id */
+ ulint trunc_len); /* in: truncate by this much; it is an error
+			if this does not equal the combined size of
+ some initial files in the space */
+/***********************************************************************
+Frees a space object from a file system. Closes the files in the chain
+but does not delete them. */
+
+void
+fil_space_free(
+/*===========*/
+ ulint id); /* in: space id */
+/***********************************************************************
+Returns the latch of a file space. */
+
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+ /* out: latch protecting storage allocation */
+ ulint id); /* in: space id */
+/***********************************************************************
+Returns the type of a file space. */
+
+ulint
+fil_space_get_type(
+/*===============*/
+ /* out: FIL_TABLESPACE or FIL_LOG */
+ ulint id); /* in: space id */
+/********************************************************************
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file. */
+
+ulint
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+ /* out: DB_SUCCESS or error number */
+ dulint lsn, /* in: lsn to write */
+ ulint arch_log_no); /* in: latest archived log file number */
+/***********************************************************************
+Reads the flushed lsn and arch no fields from a data file at database
+startup. */
+
+void
+fil_read_flushed_lsn_and_arch_log_no(
+/*=================================*/
+ os_file_t data_file, /* in: open data file */
+ ibool one_read_already, /* in: TRUE if min and max parameters
+ below already contain sensible data */
+ dulint* min_flushed_lsn, /* in/out: */
+ ulint* min_arch_log_no, /* in/out: */
+ dulint* max_flushed_lsn, /* in/out: */
+ ulint* max_arch_log_no); /* in/out: */
+/***********************************************************************
+Returns the ibuf data of a file space. */
+
+ibuf_data_t*
+fil_space_get_ibuf_data(
+/*====================*/
+ /* out: ibuf data for this space */
+ ulint id); /* in: space id */
+/***********************************************************************
+Returns the size of the space in pages. */
+
+ulint
+fil_space_get_size(
+/*===============*/
+ /* out: space size */
+ ulint id); /* in: space id */
+/***********************************************************************
+Appends a new file to the chain of files of a space.
+File must be closed. */
+
+void
+fil_node_create(
+/*============*/
+ char* name, /* in: file name (file must be closed) */
+ ulint size, /* in: file size in database blocks, rounded downwards
+ to an integer */
+ ulint id); /* in: space id where to append */
+/************************************************************************
+Reads or writes data. This operation is asynchronous (aio). */
+
+void
+fil_io(
+/*===*/
+ ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE,
+ ORed to OS_FILE_LOG, if a log i/o
+ and ORed to OS_AIO_SIMULATED_WAKE_LATER
+ if simulated aio and we want to post a
+ batch of i/os; NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ ibool sync, /* in: TRUE if synchronous aio is desired */
+ ulint space_id, /* in: space id */
+ ulint block_offset, /* in: offset in number of blocks */
+ ulint byte_offset, /* in: remainder of offset in bytes; in
+ aio this must be divisible by the OS block
+ size */
+ ulint len, /* in: how many bytes to read; this must
+ not cross a file boundary; in aio this must
+ be a block size multiple */
+ void* buf, /* in/out: buffer where to store read data
+ or from where to write; in aio this must be
+ appropriately aligned */
+ void* message); /* in: message for aio handler if non-sync
+ aio used, else ignored */
+/************************************************************************
+Reads data from a space to a buffer. Remember that the possible incomplete
+blocks at the end of a file are ignored: they are not taken into account when
+calculating the byte offset within a space. */
+
+void
+fil_read(
+/*=====*/
+ ibool sync, /* in: TRUE if synchronous aio is desired */
+ ulint space_id, /* in: space id */
+ ulint block_offset, /* in: offset in number of blocks */
+ ulint byte_offset, /* in: remainder of offset in bytes; in aio
+ this must be divisible by the OS block size */
+ ulint len, /* in: how many bytes to read; this must not
+ cross a file boundary; in aio this must be a
+ block size multiple */
+ void* buf, /* in/out: buffer where to store data read;
+ in aio this must be appropriately aligned */
+ void* message); /* in: message for aio handler if non-sync
+ aio used, else ignored */
+/************************************************************************
+Writes data to a space from a buffer. Remember that the possible incomplete
+blocks at the end of a file are ignored: they are not taken into account when
+calculating the byte offset within a space. */
+
+void
+fil_write(
+/*======*/
+ ibool sync, /* in: TRUE if synchronous aio is desired */
+ ulint space_id, /* in: space id */
+ ulint block_offset, /* in: offset in number of blocks */
+ ulint byte_offset, /* in: remainder of offset in bytes; in aio
+ this must be divisible by the OS block size */
+ ulint len, /* in: how many bytes to write; this must
+ not cross a file boundary; in aio this must
+ be a block size multiple */
+ void* buf, /* in: buffer from which to write; in aio
+ this must be appropriately aligned */
+ void* message); /* in: message for aio handler if non-sync
+ aio used, else ignored */
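+
+/* An illustrative usage sketch (the function name is hypothetical): a
+synchronous read of one full page using fil_read above. */
+UNIV_INLINE
+void
+fil_read_page_example(
+/*==================*/
+	ulint	space_id,	/* in: space id */
+	ulint	page_no,	/* in: page number within the space */
+	byte*	buf)		/* in/out: buffer of UNIV_PAGE_SIZE bytes,
+				suitably aligned for aio */
+{
+	fil_read(TRUE,		/* synchronous: wait until the read is done */
+		 space_id,
+		 page_no,	/* block offset = page number */
+		 0,		/* byte offset within the block */
+		 UNIV_PAGE_SIZE,
+		 buf,
+		 NULL);		/* message is ignored for synchronous i/o */
+}
+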
+/**************************************************************************
+Waits for an aio operation to complete. This function is used by the
+i/o-handler threads to wait for and then handle completed requests. The
+aio array of pending requests is divided
+into segments (see os0file.c for more info). The thread specifies which
+segment it wants to wait for. */
+
+void
+fil_aio_wait(
+/*=========*/
+ ulint segment); /* in: the number of the segment in the aio
+ array to wait for */
+/**************************************************************************
+Flushes to disk possible writes cached by the OS. */
+
+void
+fil_flush(
+/*======*/
+ ulint space_id); /* in: file space id (this can be a group of
+ log files or a tablespace of the database) */
+/**************************************************************************
+Flushes to disk writes in file spaces of the given type possibly cached by
+the OS. */
+
+void
+fil_flush_file_spaces(
+/*==================*/
+ ulint purpose); /* in: FIL_TABLESPACE, FIL_LOG */
+/**********************************************************************
+Checks the consistency of the file system. */
+
+ibool
+fil_validate(void);
+/*==============*/
+ /* out: TRUE if ok */
+/************************************************************************
+Accessor functions for a file page */
+
+ulint
+fil_page_get_prev(byte* page);
+ulint
+fil_page_get_next(byte* page);
+/*************************************************************************
+Sets the file page type. */
+
+void
+fil_page_set_type(
+/*==============*/
+ byte* page, /* in: file page */
+ ulint type); /* in: type */
+/*************************************************************************
+Gets the file page type. */
+
+ulint
+fil_page_get_type(
+/*==============*/
+ /* out: type; NOTE that if the type has not been
+			written to the page, the return value is not defined */
+ byte* page); /* in: file page */
+/***********************************************************************
+Tries to reserve free extents in a file space. */
+
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+			/* out: TRUE if success */
+ ulint id, /* in: space id */
+ ulint n_free_now, /* in: number of free extents now */
+ ulint n_to_reserve); /* in: how many one wants to reserve */
+/***********************************************************************
+Releases free extents in a file space. */
+
+void
+fil_space_release_free_extents(
+/*===========================*/
+ ulint id, /* in: space id */
+ ulint n_reserved); /* in: how many one reserved */
+
+typedef struct fil_space_struct fil_space_t;
+
+#endif
diff --git a/innobase/include/fsp0fsp.h b/innobase/include/fsp0fsp.h
new file mode 100644
index 00000000000..f1be4de4d40
--- /dev/null
+++ b/innobase/include/fsp0fsp.h
@@ -0,0 +1,331 @@
+/******************************************************
+File space management
+
+(c) 1995 Innobase Oy
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fsp0fsp_h
+#define fsp0fsp_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "fut0lst.h"
+#include "ut0byte.h"
+#include "page0types.h"
+
+/* If records are inserted in order, there are the following
+flags to tell this (their type is made byte for the compiler
+to warn if direction and hint parameters are switched in
+fseg_alloc_free_page): */
+#define FSP_UP ((byte)111) /* alphabetically upwards */
+#define FSP_DOWN ((byte)112) /* alphabetically downwards */
+#define FSP_NO_DIR ((byte)113) /* no order */
+
+/* File space extent size in pages */
+#define FSP_EXTENT_SIZE 64
+
+/* On a page of any file segment, data may be put starting from this offset: */
+#define FSEG_PAGE_DATA FIL_PAGE_DATA
+
+/* File segment header which points to the inode describing the file segment */
+typedef byte fseg_header_t;
+
+#define FSEG_HDR_SPACE 0 /* space id of the inode */
+#define FSEG_HDR_PAGE_NO 4 /* page number of the inode */
+#define FSEG_HDR_OFFSET 8 /* byte offset of the inode */
+
+#define FSEG_HEADER_SIZE 10
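+
+/* An illustrative sketch (the function name is hypothetical; the mach_read
+functions are assumed from mach0data.h) of how the inode address stored in a
+segment header is read field by field; note that 4 + 4 + 2 bytes gives
+FSEG_HEADER_SIZE: */
+UNIV_INLINE
+void
+fseg_header_read_example(
+/*=====================*/
+	fseg_header_t*	header,	/* in: segment header */
+	ulint*		space,	/* out: space id of the inode */
+	ulint*		page_no,/* out: page number of the inode */
+	ulint*		offset)	/* out: byte offset of the inode */
+{
+	*space	 = mach_read_from_4(header + FSEG_HDR_SPACE);
+	*page_no = mach_read_from_4(header + FSEG_HDR_PAGE_NO);
+	*offset	 = mach_read_from_2(header + FSEG_HDR_OFFSET);
+}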
+
+/**************************************************************************
+Initializes the file space system. */
+
+void
+fsp_init(void);
+/*==========*/
+/**************************************************************************
+Initializes the space header of a new created space. */
+
+void
+fsp_header_init(
+/*============*/
+ ulint space, /* in: space id */
+ ulint size, /* in: current size in blocks */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/**************************************************************************
+Increases the space size field of a space. */
+
+void
+fsp_header_inc_size(
+/*================*/
+ ulint space, /* in: space id */
+ ulint size_inc,/* in: size increment in pages */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/**************************************************************************
+Creates a new segment. */
+
+page_t*
+fseg_create(
+/*========*/
+ /* out: the page where the segment header is placed,
+ x-latched, FIL_NULL if could not create segment
+ because of lack of space */
+ ulint space, /* in: space id */
+ ulint page, /* in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment,
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /* in: byte offset of the created segment header
+ on the page */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+Creates a new segment. */
+
+page_t*
+fseg_create_general(
+/*================*/
+ /* out: the page where the segment header is placed,
+ x-latched, NULL if could not create segment
+ because of lack of space */
+ ulint space, /* in: space id */
+ ulint page, /* in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment,
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /* in: byte offset of the created segment header
+ on the page */
+	ibool	has_done_reservation, /* in: TRUE if the caller has
+			already done the reservation for the pages
+			with fsp_reserve_free_extents (at least 2 extents:
+			one for the inode and the other for the
+			segment); in that case there is no need to do
+			the check for this individual operation */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used. */
+
+ulint
+fseg_n_reserved_pages(
+/*==================*/
+ /* out: number of reserved pages */
+ fseg_header_t* header, /* in: segment header */
+ ulint* used, /* out: number of pages used (<= reserved) */
+ mtr_t* mtr); /* in: mtr handle */
+/**************************************************************************
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize
+file space fragmentation. */
+
+ulint
+fseg_alloc_free_page(
+/*=================*/
+ /* out: the allocated page offset
+ FIL_NULL if no page could be allocated */
+ fseg_header_t* seg_header, /* in: segment header */
+ ulint hint, /* in: hint of which page would be desirable */
+ byte direction, /* in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ mtr_t* mtr); /* in: mtr handle */
+/**************************************************************************
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation. */
+
+ulint
+fseg_alloc_free_page_general(
+/*=========================*/
+ /* out: allocated page offset, FIL_NULL if no
+ page could be allocated */
+ fseg_header_t* seg_header,/* in: segment header */
+ ulint hint, /* in: hint of which page would be desirable */
+ byte direction,/* in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ ibool has_done_reservation, /* in: TRUE if the caller has
+ already done the reservation for the page
+ with fsp_reserve_free_extents, then there
+ is no need to do the check for this individual
+ page */
+ mtr_t* mtr); /* in: mtr handle */
+/**************************************************************************
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space. */
+
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+ /* out: TRUE if we were able to make the reservation */
+ ulint space, /* in: space id */
+ ulint n_ext, /* in: number of extents to reserve */
+ ulint alloc_type,/* in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+ mtr_t* mtr); /* in: mtr */
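+
+/* An illustrative sketch (the function name is hypothetical) of the
+reserve/operate/release pattern described above: */
+UNIV_INLINE
+ibool
+fsp_reserve_example(
+/*================*/
+			/* out: TRUE if the operation could be carried out */
+	ulint	space,	/* in: space id */
+	mtr_t*	mtr)	/* in: mtr */
+{
+	if (!fsp_reserve_free_extents(space, 2, FSP_NORMAL, mtr)) {
+
+		return(FALSE);	/* treat as out of space: FSP_NORMAL
+				allocations are the first to fail when the
+				tablespace is nearly full */
+	}
+
+	/* ... do the multi-page operation, e.g., a B-tree page split,
+	which may use at most the reserved extents ... */
+
+	fil_space_release_free_extents(space, 2);
+
+	return(TRUE);
+}
+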
+/**************************************************************************
+This function should be used to get information on how much new data we can
+still insert into the database without running out of tablespace. Only free
+extents are taken into account and we also subtract
+the safety margin required by the above function fsp_reserve_free_extents. */
+
+ulint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+ /* out: available space in kB */
+ ulint space); /* in: space id */
+/**************************************************************************
+Frees a single page of a segment. */
+
+void
+fseg_free_page(
+/*===========*/
+ fseg_header_t* seg_header, /* in: segment header */
+ ulint space, /* in: space id */
+ ulint page, /* in: page offset */
+ mtr_t* mtr); /* in: mtr handle */
+/***********************************************************************
+Frees a segment. The freeing is performed in several mini-transactions,
+so that there is no danger of bufferfixing too many buffer pages. */
+
+void
+fseg_free(
+/*======*/
+ ulint space, /* in: space id */
+ ulint page_no,/* in: page number where the segment header is
+ placed */
+ ulint offset);/* in: byte offset of the segment header on that
+ page */
+/**************************************************************************
+Frees part of a segment. This function can be used to free a segment
+by repeatedly calling this function in different mini-transactions.
+Doing the freeing in a single mini-transaction might result in
+too big a mini-transaction. */
+
+ibool
+fseg_free_step(
+/*===========*/
+ /* out: TRUE if freeing completed */
+ fseg_header_t* header, /* in, own: segment header; NOTE: if the header
+ resides on the first page of the frag list
+ of the segment, this pointer becomes obsolete
+ after the last freeing step */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed. */
+
+ibool
+fseg_free_step_not_header(
+/*======================*/
+ /* out: TRUE if freeing completed, except the
+ header page */
+ fseg_header_t* header, /* in: segment header which must reside on
+ the first fragment page of the segment */
+ mtr_t* mtr); /* in: mtr */
+/***************************************************************************
+Checks if a page address is an extent descriptor page address. */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+ /* out: TRUE if a descriptor page */
+ ulint page_no);/* in: page number */
+/***************************************************************
+Parses a redo log record of a file page init. */
+
+byte*
+fsp_parse_init_file_page(
+/*=====================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page); /* in: page or NULL */
+/***********************************************************************
+Validates the file space system and its segments. */
+
+ibool
+fsp_validate(
+/*=========*/
+ /* out: TRUE if ok */
+ ulint space); /* in: space id */
+/***********************************************************************
+Prints info of a file space. */
+
+void
+fsp_print(
+/*======*/
+ ulint space); /* in: space id */
+/***********************************************************************
+Validates a segment. */
+
+ibool
+fseg_validate(
+/*==========*/
+ /* out: TRUE if ok */
+ fseg_header_t* header, /* in: segment header */
+ mtr_t* mtr2); /* in: mtr */
+/***********************************************************************
+Writes info of a segment. */
+
+void
+fseg_print(
+/*=======*/
+ fseg_header_t* header, /* in: segment header */
+ mtr_t* mtr); /* in: mtr */
+
+/* Flags for fsp_reserve_free_extents */
+#define FSP_NORMAL 1000000
+#define FSP_UNDO 2000000
+#define FSP_CLEANING 3000000
+
+/* Number of pages described in a single descriptor page: currently each page
+description takes less than 1 byte; a descriptor page is repeated every
+this many file pages */
+#define XDES_DESCRIBED_PER_PAGE UNIV_PAGE_SIZE
+
+/* The space low address page map, and also offsets for extent descriptor and
+bitmap pages, which are always repeated after XDES_DESCRIBED_PER_PAGE more
+pages: */
+/*--------------------------------------*/
+#define FSP_XDES_OFFSET 0
+#define FSP_IBUF_BITMAP_OFFSET 1
+ /* The ibuf bitmap pages are the ones whose
+ page number is the number above plus a
+ multiple of XDES_DESCRIBED_PER_PAGE */
+#define FSP_FIRST_INODE_PAGE_NO 2
+#define FSP_IBUF_HEADER_PAGE_NO 3
+#define FSP_IBUF_TREE_ROOT_PAGE_NO 4
+ /* The ibuf tree root page number in each
+ tablespace; its fseg inode is on the page
+ number FSP_FIRST_INODE_PAGE_NO */
+#define FSP_TRX_SYS_PAGE_NO 5
+#define FSP_FIRST_RSEG_PAGE_NO 6
+#define FSP_DICT_HDR_PAGE_NO 7
+/*--------------------------------------*/
+
+#ifndef UNIV_NONINL
+#include "fsp0fsp.ic"
+#endif
+
+#endif
diff --git a/innobase/include/fsp0fsp.ic b/innobase/include/fsp0fsp.ic
new file mode 100644
index 00000000000..89cd9263bd6
--- /dev/null
+++ b/innobase/include/fsp0fsp.ic
@@ -0,0 +1,24 @@
+/******************************************************
+File space management
+
+(c) 1995 Innobase Oy
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+/***************************************************************************
+Checks if a page address is an extent descriptor page address. */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+ /* out: TRUE if a descriptor page */
+ ulint page_no)/* in: page number */
+{
+ if (page_no % XDES_DESCRIBED_PER_PAGE == FSP_XDES_OFFSET) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
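+
+/* An illustrative sketch (the function name is hypothetical): page 0 and
+every page whose number is a multiple of XDES_DESCRIBED_PER_PAGE is an
+extent descriptor page. */
+UNIV_INLINE
+void
+fsp_descr_page_example(void)
+/*========================*/
+{
+	ut_a(fsp_descr_page(0));
+	ut_a(!fsp_descr_page(1));
+	ut_a(!fsp_descr_page(FSP_FIRST_INODE_PAGE_NO));
+	ut_a(fsp_descr_page(XDES_DESCRIBED_PER_PAGE));
+	ut_a(fsp_descr_page(2 * XDES_DESCRIBED_PER_PAGE));
+}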
diff --git a/innobase/include/fut0fut.h b/innobase/include/fut0fut.h
new file mode 100644
index 00000000000..b9546b4e1a0
--- /dev/null
+++ b/innobase/include/fut0fut.h
@@ -0,0 +1,36 @@
+/**********************************************************************
+File-based utilities
+
+(c) 1995 Innobase Oy
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+
+#ifndef fut0fut_h
+#define fut0fut_h
+
+#include "univ.i"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+/************************************************************************
+Gets a pointer to a file address and latches the page. */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+/*========*/
+ /* out: pointer to a byte in a frame; the file
+ page in the frame is bufferfixed and latched */
+ ulint space, /* in: space id */
+ fil_addr_t addr, /* in: file address */
+ ulint rw_latch, /* in: RW_S_LATCH, RW_X_LATCH */
+ mtr_t* mtr); /* in: mtr handle */
+
+#ifndef UNIV_NONINL
+#include "fut0fut.ic"
+#endif
+
+#endif
+
diff --git a/innobase/include/fut0fut.ic b/innobase/include/fut0fut.ic
new file mode 100644
index 00000000000..0f1aa9dd9ae
--- /dev/null
+++ b/innobase/include/fut0fut.ic
@@ -0,0 +1,36 @@
+/**********************************************************************
+File-based utilities
+
+(c) 1995 Innobase Oy
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "sync0rw.h"
+#include "buf0buf.h"
+
+/************************************************************************
+Gets a pointer to a file address and latches the page. */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+/*========*/
+ /* out: pointer to a byte in a frame; the file
+ page in the frame is bufferfixed and latched */
+ ulint space, /* in: space id */
+ fil_addr_t addr, /* in: file address */
+ ulint rw_latch, /* in: RW_S_LATCH, RW_X_LATCH */
+ mtr_t* mtr) /* in: mtr handle */
+{
+ byte* ptr;
+
+ ut_ad(mtr);
+ ut_ad(addr.boffset < UNIV_PAGE_SIZE);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+ ptr = buf_page_get(space, addr.page, rw_latch, mtr) + addr.boffset;
+
+ buf_page_dbg_add_level(ptr, SYNC_NO_ORDER_CHECK);
+
+ return(ptr);
+}
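+
+/* An illustrative usage sketch (the function name is hypothetical): reads
+one byte at a file address under an s-latch, inside its own
+mini-transaction. */
+UNIV_INLINE
+ulint
+fut_read_byte_example(
+/*==================*/
+				/* out: value of the byte at addr */
+	ulint		space,	/* in: space id */
+	fil_addr_t	addr)	/* in: file address of the byte */
+{
+	mtr_t	mtr;
+	byte*	ptr;
+	ulint	val;
+
+	mtr_start(&mtr);
+
+	ptr = fut_get_ptr(space, addr, RW_S_LATCH, &mtr);
+
+	val = (ulint)(*ptr);	/* the pointer is valid only while the page
+				is latched, i.e., until the mtr commits */
+	mtr_commit(&mtr);
+
+	return(val);
+}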
diff --git a/innobase/include/fut0lst.h b/innobase/include/fut0lst.h
new file mode 100644
index 00000000000..5427e2248da
--- /dev/null
+++ b/innobase/include/fut0lst.h
@@ -0,0 +1,198 @@
+/**********************************************************************
+File-based list utilities
+
+(c) 1995 Innobase Oy
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef fut0lst_h
+#define fut0lst_h
+
+#include "univ.i"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+
+/* The C 'types' of base node and list node: these should be used to
+write self-documenting code. Of course, the sizeof macro cannot be
+applied to these types! */
+
+typedef byte flst_base_node_t;
+typedef byte flst_node_t;
+
+/* The physical size of a list base node in bytes */
+#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
+
+/* The physical size of a list node in bytes */
+#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
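+
+/* Layout note: a base node stores the 4-byte list length followed by the
+6-byte addresses of the first and the last node (4 + 2 * FIL_ADDR_SIZE
+bytes), and a list node stores the 6-byte addresses of the previous and the
+next node (2 * FIL_ADDR_SIZE bytes). */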
+
+
+/************************************************************************
+Initializes a list base node. */
+UNIV_INLINE
+void
+flst_init(
+/*======*/
+ flst_base_node_t* base, /* in: pointer to base node */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Adds a node as the last node in a list. */
+
+void
+flst_add_last(
+/*==========*/
+ flst_base_node_t* base, /* in: pointer to base node of list */
+ flst_node_t* node, /* in: node to add */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Adds a node as the first node in a list. */
+
+void
+flst_add_first(
+/*===========*/
+ flst_base_node_t* base, /* in: pointer to base node of list */
+ flst_node_t* node, /* in: node to add */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Inserts a node after another in a list. */
+
+void
+flst_insert_after(
+/*==============*/
+ flst_base_node_t* base, /* in: pointer to base node of list */
+ flst_node_t* node1, /* in: node to insert after */
+ flst_node_t* node2, /* in: node to add */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Inserts a node before another in a list. */
+
+void
+flst_insert_before(
+/*===============*/
+ flst_base_node_t* base, /* in: pointer to base node of list */
+ flst_node_t* node2, /* in: node to insert */
+ flst_node_t* node3, /* in: node to insert before */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Removes a node. */
+
+void
+flst_remove(
+/*========*/
+ flst_base_node_t* base, /* in: pointer to base node of list */
+ flst_node_t* node2, /* in: node to remove */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Cuts off the tail of the list, including the node given. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+
+void
+flst_cut_end(
+/*=========*/
+ flst_base_node_t* base, /* in: pointer to base node of list */
+ flst_node_t* node2, /* in: first node to remove */
+ ulint n_nodes,/* in: number of nodes to remove,
+ must be >= 1 */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Cuts off the tail of the list, not including the given node. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+
+void
+flst_truncate_end(
+/*==============*/
+ flst_base_node_t* base, /* in: pointer to base node of list */
+ flst_node_t* node2, /* in: first node not to remove */
+ ulint n_nodes,/* in: number of nodes to remove */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Gets list length. */
+UNIV_INLINE
+ulint
+flst_get_len(
+/*=========*/
+ /* out: length */
+ flst_base_node_t* base, /* in: pointer to base node */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Gets list first node address. */
+UNIV_INLINE
+fil_addr_t
+flst_get_first(
+/*===========*/
+ /* out: file address */
+ flst_base_node_t* base, /* in: pointer to base node */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Gets list last node address. */
+UNIV_INLINE
+fil_addr_t
+flst_get_last(
+/*==========*/
+ /* out: file address */
+ flst_base_node_t* base, /* in: pointer to base node */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Gets list next node address. */
+UNIV_INLINE
+fil_addr_t
+flst_get_next_addr(
+/*===============*/
+ /* out: file address */
+ flst_node_t* node, /* in: pointer to node */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Gets list prev node address. */
+UNIV_INLINE
+fil_addr_t
+flst_get_prev_addr(
+/*===============*/
+ /* out: file address */
+ flst_node_t* node, /* in: pointer to node */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Writes a file address. */
+UNIV_INLINE
+void
+flst_write_addr(
+/*============*/
+	fil_faddr_t*	faddr,	/* in: pointer to file address */
+ fil_addr_t addr, /* in: file address */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Reads a file address. */
+UNIV_INLINE
+fil_addr_t
+flst_read_addr(
+/*===========*/
+ /* out: file address */
+	fil_faddr_t*	faddr,	/* in: pointer to file address */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************************
+Validates a file-based list. */
+
+ibool
+flst_validate(
+/*==========*/
+ /* out: TRUE if ok */
+ flst_base_node_t* base, /* in: pointer to base node of list */
+ mtr_t* mtr1); /* in: mtr */
+/************************************************************************
+Prints info of a file-based list. */
+
+void
+flst_print(
+/*=======*/
+ flst_base_node_t* base, /* in: pointer to base node of list */
+ mtr_t* mtr); /* in: mtr */
+
+
+#ifndef UNIV_NONINL
+#include "fut0lst.ic"
+#endif
+
+#endif
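
Worked size check (editorial note): a file address consists of a 4-byte page number and a 2-byte byte offset, i.e. FIL_ADDR_SIZE is 6 bytes (flst_write_addr() in fut0lst.ic below writes the two parts with MLOG_4BYTES and MLOG_2BYTES). Hence FLST_NODE_SIZE = 2 * 6 = 12 bytes (the prev and next addresses), and FLST_BASE_NODE_SIZE = 4 + 2 * 6 = 16 bytes (the 32-bit length field plus the first and last addresses).
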
diff --git a/innobase/include/fut0lst.ic b/innobase/include/fut0lst.ic
new file mode 100644
index 00000000000..d2e79cf7640
--- /dev/null
+++ b/innobase/include/fut0lst.ic
@@ -0,0 +1,147 @@
+/**********************************************************************
+File-based list utilities
+
+(c) 1995 Innobase Oy
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0fut.h"
+#include "mtr0log.h"
+#include "buf0buf.h"
+
+/* We define the field offsets of a node for the list */
+#define FLST_PREV 0 /* 6-byte address of the previous list element;
+ the page part of address is FIL_NULL, if no
+ previous element */
+#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next
+ list element; the page part of address
+ is FIL_NULL, if no next element */
+
+/* We define the field offsets of a base node for the list */
+#define FLST_LEN 0 /* 32-bit list length field */
+#define FLST_FIRST 4 /* 6-byte address of the first element
+ of the list; undefined if empty list */
+#define FLST_LAST	(4 + FIL_ADDR_SIZE) /* 6-byte address of the
+					last element of the list; undefined
+					if empty list */
+
+/************************************************************************
+Writes a file address. */
+UNIV_INLINE
+void
+flst_write_addr(
+/*============*/
+	fil_faddr_t*	faddr,	/* in: pointer to file address */
+ fil_addr_t addr, /* in: file address */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ ut_ad(faddr && mtr);
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(faddr),
+ MTR_MEMO_PAGE_X_FIX));
+
+ mlog_write_ulint(faddr + FIL_ADDR_PAGE, addr.page, MLOG_4BYTES, mtr);
+ mlog_write_ulint(faddr + FIL_ADDR_BYTE, addr.boffset,
+ MLOG_2BYTES, mtr);
+}
+
+/************************************************************************
+Reads a file address. */
+UNIV_INLINE
+fil_addr_t
+flst_read_addr(
+/*===========*/
+ /* out: file address */
+	fil_faddr_t*	faddr,	/* in: pointer to file address */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ fil_addr_t addr;
+
+ ut_ad(faddr && mtr);
+
+ addr.page = mtr_read_ulint(faddr + FIL_ADDR_PAGE, MLOG_4BYTES, mtr);
+ addr.boffset = mtr_read_ulint(faddr + FIL_ADDR_BYTE, MLOG_2BYTES,
+ mtr);
+ return(addr);
+}
+
+/************************************************************************
+Initializes a list base node. */
+UNIV_INLINE
+void
+flst_init(
+/*======*/
+ flst_base_node_t* base, /* in: pointer to base node */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(base),
+ MTR_MEMO_PAGE_X_FIX));
+ mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr);
+ flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr);
+ flst_write_addr(base + FLST_LAST, fil_addr_null, mtr);
+}
+
+/************************************************************************
+Gets list length. */
+UNIV_INLINE
+ulint
+flst_get_len(
+/*=========*/
+ /* out: length */
+ flst_base_node_t* base, /* in: pointer to base node */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ return(mtr_read_ulint(base + FLST_LEN, MLOG_4BYTES, mtr));
+}
+
+/************************************************************************
+Gets list first node address. */
+UNIV_INLINE
+fil_addr_t
+flst_get_first(
+/*===========*/
+ /* out: file address */
+ flst_base_node_t* base, /* in: pointer to base node */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ return(flst_read_addr(base + FLST_FIRST, mtr));
+}
+
+/************************************************************************
+Gets list last node address. */
+UNIV_INLINE
+fil_addr_t
+flst_get_last(
+/*==========*/
+ /* out: file address */
+ flst_base_node_t* base, /* in: pointer to base node */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ return(flst_read_addr(base + FLST_LAST, mtr));
+}
+
+/************************************************************************
+Gets list next node address. */
+UNIV_INLINE
+fil_addr_t
+flst_get_next_addr(
+/*===============*/
+ /* out: file address */
+ flst_node_t* node, /* in: pointer to node */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ return(flst_read_addr(node + FLST_NEXT, mtr));
+}
+
+/************************************************************************
+Gets list prev node address. */
+UNIV_INLINE
+fil_addr_t
+flst_get_prev_addr(
+/*===============*/
+ /* out: file address */
+ flst_node_t* node, /* in: pointer to node */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ return(flst_read_addr(node + FLST_PREV, mtr));
+}
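
Traversal sketch (editorial, not part of the patch): walk a file-based list from its base node and count the nodes, which should agree with the stored FLST_LEN. mtr_start()/mtr_commit() are assumed from mtr0mtr.h; in real callers the base node usually lives on a page the caller has already latched, and a very long list would not be walked inside a single mini-transaction.

#include "fut0fut.h"
#include "fut0lst.h"
#include "mtr0mtr.h"

static ulint
example_count_list_nodes(
	ulint		space,		/* in: tablespace id */
	fil_addr_t	base_addr)	/* in: address of the list base node */
{
	mtr_t			mtr;
	flst_base_node_t*	base;
	flst_node_t*		node;
	fil_addr_t		addr;
	ulint			count	= 0;

	mtr_start(&mtr);		/* assumed mtr0mtr.h API */

	base = fut_get_ptr(space, base_addr, RW_S_LATCH, &mtr);
	addr = flst_get_first(base, &mtr);

	while (addr.page != FIL_NULL) {
		node = fut_get_ptr(space, addr, RW_S_LATCH, &mtr);
		count++;

		addr = flst_get_next_addr(node, &mtr);
	}

	ut_ad(count == flst_get_len(base, &mtr));

	mtr_commit(&mtr);

	return(count);
}
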
diff --git a/innobase/include/ha0ha.h b/innobase/include/ha0ha.h
new file mode 100644
index 00000000000..aeed7c32eff
--- /dev/null
+++ b/innobase/include/ha0ha.h
@@ -0,0 +1,137 @@
+/******************************************************
+The hash table with external chains
+
+(c) 1994-1997 Innobase Oy
+
+Created 8/18/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef ha0ha_h
+#define ha0ha_h
+
+#include "univ.i"
+
+#include "hash0hash.h"
+#include "page0types.h"
+
+/*****************************************************************
+Looks for an element in a hash table. */
+UNIV_INLINE
+void*
+ha_search_and_get_data(
+/*===================*/
+ /* out: pointer to the data of the first hash
+ table node in chain having the fold number,
+ NULL if not found */
+ hash_table_t* table, /* in: hash table */
+ ulint fold); /* in: folded value of the searched data */
+/*************************************************************
+Looks for an element when we know the pointer to the data and updates
+the pointer to data if found. */
+UNIV_INLINE
+void
+ha_search_and_update_if_found(
+/*==========================*/
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: folded value of the searched data */
+ void* data, /* in: pointer to the data */
+ void* new_data);/* in: new pointer to the data */
+/*****************************************************************
+Creates a hash table with >= n array cells. The actual number of cells is
+chosen to be a prime number slightly bigger than n. */
+
+hash_table_t*
+ha_create(
+/*======*/
+ /* out, own: created table */
+ ibool in_btr_search, /* in: TRUE if the hash table is used in
+ the btr_search module */
+ ulint n, /* in: number of array cells */
+ ulint n_mutexes, /* in: number of mutexes to protect the
+ hash table: must be a power of 2 */
+ ulint mutex_level); /* in: level of the mutexes in the latching
+ order: this is used in the debug version */
+/*****************************************************************
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted. */
+
+ibool
+ha_insert_for_fold(
+/*===============*/
+ /* out: TRUE if succeed, FALSE if no more
+ memory could be allocated */
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: folded value of data; if a node with
+ the same fold value already exists, it is
+ updated to point to the same data, and no new
+ node is created! */
+ void* data); /* in: data, must not be NULL */
+/*****************************************************************
+Reserves the necessary hash table mutex and inserts an entry into the hash
+table. */
+UNIV_INLINE
+ibool
+ha_insert_for_fold_mutex(
+/*=====================*/
+ /* out: TRUE if succeed, FALSE if no more
+ memory could be allocated */
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: folded value of data; if a node with
+ the same fold value already exists, it is
+ updated to point to the same data, and no new
+ node is created! */
+ void* data); /* in: data, must not be NULL */
+/*****************************************************************
+Deletes an entry from a hash table. */
+
+void
+ha_delete(
+/*======*/
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: folded value of data */
+ void* data); /* in: data, must not be NULL and must exist
+ in the hash table */
+/*************************************************************
+Looks for an element when we know the pointer to the data and deletes
+it from the hash table if found. */
+UNIV_INLINE
+ibool
+ha_search_and_delete_if_found(
+/*==========================*/
+ /* out: TRUE if found */
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: folded value of the searched data */
+ void* data); /* in: pointer to the data */
+/*********************************************************************
+Removes from the chain determined by fold all nodes whose data pointer
+points to the page given. */
+
+void
+ha_remove_all_nodes_to_page(
+/*========================*/
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: fold value */
+ page_t* page); /* in: buffer page */
+/*****************************************************************
+Validates a hash table. */
+
+ibool
+ha_validate(
+/*========*/
+ /* out: TRUE if ok */
+ hash_table_t* table); /* in: hash table */
+/*****************************************************************
+Prints info of a hash table. */
+
+void
+ha_print_info(
+/*==========*/
+ hash_table_t* table); /* in: hash table */
+
+
+#ifndef UNIV_NONINL
+#include "ha0ha.ic"
+#endif
+
+#endif
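
Usage sketch (editorial, not part of the patch): create an external-chain hash table, insert a (fold, data) pair under the protecting mutex, look the data up again, and delete it. The cell count, the fold value 777 and the latching level (SYNC_NO_ORDER_CHECK, the level also used for the debug latching check in fut0fut.ic above) are illustrative choices, not prescribed by the API.

#include "ha0ha.h"

static void
example_ha_usage(void* data)	/* in: data pointer, must not be NULL */
{
	hash_table_t*	table;
	void*		found;

	/* Not the btr_search table; >= 1000 cells; one mutex (a power
	of 2); the latching level is used only by the debug version. */
	table = ha_create(FALSE, 1000, 1, SYNC_NO_ORDER_CHECK);

	if (!ha_insert_for_fold_mutex(table, 777, data)) {

		return;		/* out of memory */
	}

	hash_mutex_enter(table, 777);

	found = ha_search_and_get_data(table, 777);
	ut_ad(found == data);

	ha_delete(table, 777, data);

	hash_mutex_exit(table, 777);
}
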
diff --git a/innobase/include/ha0ha.ic b/innobase/include/ha0ha.ic
new file mode 100644
index 00000000000..7b4c624c653
--- /dev/null
+++ b/innobase/include/ha0ha.ic
@@ -0,0 +1,280 @@
+/************************************************************************
+The hash table with external chains
+
+(c) 1994-1997 Innobase Oy
+
+Created 8/18/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0rnd.h"
+#include "mem0mem.h"
+
+/* The hash table external chain node */
+
+typedef struct ha_node_struct ha_node_t;
+
+struct ha_node_struct {
+ ha_node_t* next; /* next chain node or NULL if none */
+ void* data; /* pointer to the data */
+ ulint fold; /* fold value for the data */
+};
+
+/***************************************************************
+Deletes a hash node. */
+
+void
+ha_delete_hash_node(
+/*================*/
+ hash_table_t* table, /* in: hash table */
+ ha_node_t* del_node); /* in: node to be deleted */
+
+/**********************************************************************
+Gets a hash node data. */
+UNIV_INLINE
+void*
+ha_node_get_data(
+/*=============*/
+ /* out: pointer to the data */
+ ha_node_t* node) /* in: hash chain node */
+{
+ return(node->data);
+}
+
+/**********************************************************************
+Sets hash node data. */
+UNIV_INLINE
+void
+ha_node_set_data(
+/*=============*/
+ ha_node_t* node, /* in: hash chain node */
+ void* data) /* in: pointer to the data */
+{
+ node->data = data;
+}
+
+/**********************************************************************
+Gets the next node in a hash chain. */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_next(
+/*==============*/
+ /* out: next node, NULL if none */
+ hash_table_t* table, /* in: hash table */
+ ha_node_t* node) /* in: hash chain node */
+{
+ ut_ad(table);
+
+ return(node->next);
+}
+
+/**********************************************************************
+Gets the first node in a hash chain. */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_first(
+/*===============*/
+ /* out: first node, NULL if none */
+ hash_table_t* table, /* in: hash table */
+ ulint fold) /* in: fold value determining the chain */
+{
+ return(hash_get_nth_cell(table, hash_calc_hash(fold, table))->node);
+}
+
+/*****************************************************************
+Looks for an element in a hash table. */
+UNIV_INLINE
+ha_node_t*
+ha_search(
+/*======*/
+ /* out: pointer to the first hash table node
+ in chain having the fold number, NULL if not
+ found */
+ hash_table_t* table, /* in: hash table */
+ ulint fold) /* in: folded value of the searched data */
+{
+ ha_node_t* node;
+
+ ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold)));
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->fold == fold) {
+
+ return(node);
+ }
+
+ node = ha_chain_get_next(table, node);
+ }
+
+ return(NULL);
+}
+
+/*****************************************************************
+Looks for an element in a hash table. */
+UNIV_INLINE
+void*
+ha_search_and_get_data(
+/*===================*/
+ /* out: pointer to the data of the first hash
+ table node in chain having the fold number,
+ NULL if not found */
+ hash_table_t* table, /* in: hash table */
+ ulint fold) /* in: folded value of the searched data */
+{
+ ha_node_t* node;
+
+ ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold)));
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->fold == fold) {
+
+ return(node->data);
+ }
+
+ node = ha_chain_get_next(table, node);
+ }
+
+ return(NULL);
+}
+
+/*****************************************************************
+Returns the next matching hash table node in chain. */
+UNIV_INLINE
+ha_node_t*
+ha_next(
+/*====*/
+ /* out: pointer to the next hash table node
+ in chain with the fold value, NULL if not
+ found */
+ hash_table_t* table, /* in: hash table */
+ ha_node_t* node) /* in: hash table node */
+{
+ ulint fold;
+
+ fold = node->fold;
+
+ ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold)));
+
+ node = ha_chain_get_next(table, node);
+
+ while (node) {
+ if (node->fold == fold) {
+
+ return(node);
+ }
+
+ node = ha_chain_get_next(table, node);
+ }
+
+ return(NULL);
+}
+
+/*************************************************************
+Looks for an element when we know the pointer to the data. */
+UNIV_INLINE
+ha_node_t*
+ha_search_with_data(
+/*================*/
+ /* out: pointer to the hash table node, NULL
+ if not found in the table */
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: folded value of the searched data */
+ void* data) /* in: pointer to the data */
+{
+ ha_node_t* node;
+
+ ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold)));
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->data == data) {
+
+ return(node);
+ }
+
+ node = ha_chain_get_next(table, node);
+ }
+
+ return(NULL);
+}
+
+/*************************************************************
+Looks for an element when we know the pointer to the data, and updates
+the pointer to data, if found. */
+UNIV_INLINE
+void
+ha_search_and_update_if_found(
+/*==========================*/
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: folded value of the searched data */
+ void* data, /* in: pointer to the data */
+ void* new_data)/* in: new pointer to the data */
+{
+ ha_node_t* node;
+
+ ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold)));
+
+ node = ha_search_with_data(table, fold, data);
+
+ if (node) {
+ node->data = new_data;
+ }
+}
+
+/*************************************************************
+Looks for an element when we know the pointer to the data, and deletes
+it from the hash table, if found. */
+UNIV_INLINE
+ibool
+ha_search_and_delete_if_found(
+/*==========================*/
+ /* out: TRUE if found */
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: folded value of the searched data */
+ void* data) /* in: pointer to the data */
+{
+ ha_node_t* node;
+
+ ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold)));
+
+ node = ha_search_with_data(table, fold, data);
+
+ if (node) {
+ ha_delete_hash_node(table, node);
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*****************************************************************
+Reserves the necessary hash table mutex and inserts an entry into the hash
+table. */
+UNIV_INLINE
+ibool
+ha_insert_for_fold_mutex(
+/*=====================*/
+ /* out: TRUE if succeed, FALSE if no more
+ memory could be allocated */
+ hash_table_t* table, /* in: hash table */
+ ulint fold, /* in: folded value of data; if a node with
+ the same fold value already exists, it is
+ updated to point to the same data, and no new
+ node is created! */
+ void* data) /* in: data, must not be NULL */
+{
+ ibool ret;
+
+ hash_mutex_enter(table, fold);
+
+ ret = ha_insert_for_fold(table, fold, data);
+
+ hash_mutex_exit(table, fold);
+
+ return(ret);
+}
diff --git a/innobase/include/hash0hash.h b/innobase/include/hash0hash.h
new file mode 100644
index 00000000000..378925a5bea
--- /dev/null
+++ b/innobase/include/hash0hash.h
@@ -0,0 +1,345 @@
+/******************************************************
+The simple hash table utility
+
+(c) 1997 Innobase Oy
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef hash0hash_h
+#define hash0hash_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#include "sync0sync.h"
+
+typedef struct hash_table_struct hash_table_t;
+typedef struct hash_cell_struct hash_cell_t;
+
+typedef void* hash_node_t;
+
+/*****************************************************************
+Creates a hash table with >= n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n. */
+
+hash_table_t*
+hash_create(
+/*========*/
+ /* out, own: created table */
+ ulint n); /* in: number of array cells */
+/*****************************************************************
+Creates a mutex array to protect a hash table. */
+
+void
+hash_create_mutexes(
+/*================*/
+ hash_table_t* table, /* in: hash table */
+ ulint n_mutexes, /* in: number of mutexes */
+ ulint sync_level); /* in: latching order level of the
+ mutexes: used in the debug version */
+/*****************************************************************
+Frees a hash table. */
+
+void
+hash_table_free(
+/*============*/
+ hash_table_t* table); /* in, own: hash table */
+/******************************************************************
+Calculates the hash value from a folded value. */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+ /* out: hashed value */
+ ulint fold, /* in: folded value */
+ hash_table_t* table); /* in: hash table */
+/***********************************************************************
+Inserts a struct to a hash table. */
+
+#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\
+{\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ ut_ad(!(TABLE)->mutexes || mutex_own(hash_get_mutex(TABLE, FOLD)));\
+\
+ (DATA)->NAME = NULL;\
+\
+ cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+ if (cell3333->node == NULL) {\
+ cell3333->node = DATA;\
+ } else {\
+ struct3333 = cell3333->node;\
+\
+ while (struct3333->NAME != NULL) {\
+\
+ struct3333 = struct3333->NAME;\
+ }\
+\
+ struct3333->NAME = DATA;\
+ }\
+}
+
+/***********************************************************************
+Deletes a struct from a hash table. */
+
+#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\
+{\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ ut_ad(!(TABLE)->mutexes || mutex_own(hash_get_mutex(TABLE, FOLD)));\
+\
+ cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+ if (cell3333->node == DATA) {\
+ cell3333->node = DATA->NAME;\
+ } else {\
+ struct3333 = cell3333->node;\
+\
+ while (struct3333->NAME != DATA) {\
+\
+			ut_ad(struct3333);\
+ struct3333 = struct3333->NAME;\
+ }\
+\
+ struct3333->NAME = DATA->NAME;\
+ }\
+}
+
+/***********************************************************************
+Gets the first struct in a hash chain, NULL if none. */
+
+#define HASH_GET_FIRST(TABLE, HASH_VAL)\
+ (hash_get_nth_cell(TABLE, HASH_VAL)->node)
+
+/***********************************************************************
+Gets the next struct in a hash chain, NULL if none. */
+
+#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME)
+
+/************************************************************************
+Looks for a struct in a hash table. */
+#define HASH_SEARCH(NAME, TABLE, FOLD, DATA, TEST)\
+{\
+\
+ ut_ad(!(TABLE)->mutexes || mutex_own(hash_get_mutex(TABLE, FOLD)));\
+\
+ (DATA) = HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+ while ((DATA) != NULL) {\
+ if (TEST) {\
+ break;\
+ } else {\
+ (DATA) = HASH_GET_NEXT(NAME, DATA);\
+ }\
+ }\
+}
+
+/****************************************************************
+Gets the nth cell in a hash table. */
+UNIV_INLINE
+hash_cell_t*
+hash_get_nth_cell(
+/*==============*/
+ /* out: pointer to cell */
+ hash_table_t* table, /* in: hash table */
+ ulint n); /* in: cell index */
+/*****************************************************************
+Returns the number of cells in a hash table. */
+UNIV_INLINE
+ulint
+hash_get_n_cells(
+/*=============*/
+ /* out: number of cells */
+ hash_table_t* table); /* in: table */
+/***********************************************************************
+Deletes a struct which is stored in the heap of the hash table, and compacts
+the heap. The fold value must be stored in the struct NODE in a field named
+'fold'. */
+
+#define HASH_DELETE_AND_COMPACT(TYPE, NAME, TABLE, NODE)\
+{\
+ TYPE* node111;\
+ TYPE* top_node111;\
+ hash_cell_t* cell111;\
+ ulint fold111;\
+\
+ fold111 = (NODE)->fold;\
+\
+ HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\
+\
+ top_node111 = (TYPE*)mem_heap_get_top(\
+ hash_get_heap(TABLE, fold111),\
+ sizeof(TYPE));\
+\
+ /* If the node to remove is not the top node in the heap, compact the\
+ heap of nodes by moving the top node in the place of NODE. */\
+\
+ if (NODE != top_node111) {\
+\
+ /* Copy the top node in place of NODE */\
+\
+ *(NODE) = *top_node111;\
+\
+ cell111 = hash_get_nth_cell(TABLE,\
+ hash_calc_hash(top_node111->fold, TABLE));\
+\
+ /* Look for the pointer to the top node, to update it */\
+\
+ if (cell111->node == top_node111) {\
+ /* The top node is the first in the chain */\
+\
+ cell111->node = NODE;\
+ } else {\
+ /* We have to look for the predecessor of the top\
+ node */\
+ node111 = cell111->node;\
+\
+ while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\
+\
+ node111 = HASH_GET_NEXT(NAME, node111);\
+ }\
+\
+ /* Now we have the predecessor node */\
+\
+ node111->NAME = NODE;\
+ }\
+ }\
+\
+ /* Free the space occupied by the top node */\
+\
+ mem_heap_free_top(hash_get_heap(TABLE, fold111), sizeof(TYPE));\
+}
+
+/***********************************************************************
+Calculates the number of stored structs in a hash table. */
+
+#define HASH_GET_N_NODES(TYPE, NAME, TABLE, N)\
+{\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+ ulint i3333;\
+\
+ (N) = 0;\
+\
+ for (i3333 = 0; i3333 < hash_get_n_cells(TABLE); i3333++) {\
+\
+ cell3333 = hash_get_nth_cell(TABLE, i3333);\
+\
+ struct3333 = cell3333->node;\
+\
+ while (struct3333) {\
+\
+ (N) = (N) + 1;\
+\
+			struct3333 = HASH_GET_NEXT(NAME, struct3333);\
+ }\
+ }\
+}
+
+/****************************************************************
+Gets the mutex index for a fold value in a hash table. */
+UNIV_INLINE
+ulint
+hash_get_mutex_no(
+/*==============*/
+ /* out: mutex number */
+ hash_table_t* table, /* in: hash table */
+ ulint fold); /* in: fold */
+/****************************************************************
+Gets the nth heap in a hash table. */
+UNIV_INLINE
+mem_heap_t*
+hash_get_nth_heap(
+/*==============*/
+ /* out: mem heap */
+ hash_table_t* table, /* in: hash table */
+ ulint i); /* in: index of the heap */
+/****************************************************************
+Gets the heap for a fold value in a hash table. */
+UNIV_INLINE
+mem_heap_t*
+hash_get_heap(
+/*==========*/
+ /* out: mem heap */
+ hash_table_t* table, /* in: hash table */
+ ulint fold); /* in: fold */
+/****************************************************************
+Gets the nth mutex in a hash table. */
+UNIV_INLINE
+mutex_t*
+hash_get_nth_mutex(
+/*===============*/
+ /* out: mutex */
+ hash_table_t* table, /* in: hash table */
+ ulint i); /* in: index of the mutex */
+/****************************************************************
+Gets the mutex for a fold value in a hash table. */
+UNIV_INLINE
+mutex_t*
+hash_get_mutex(
+/*===========*/
+ /* out: mutex */
+ hash_table_t* table, /* in: hash table */
+ ulint fold); /* in: fold */
+/****************************************************************
+Reserves the mutex for a fold value in a hash table. */
+
+void
+hash_mutex_enter(
+/*=============*/
+ hash_table_t* table, /* in: hash table */
+ ulint fold); /* in: fold */
+/****************************************************************
+Releases the mutex for a fold value in a hash table. */
+
+void
+hash_mutex_exit(
+/*============*/
+ hash_table_t* table, /* in: hash table */
+ ulint fold); /* in: fold */
+/****************************************************************
+Reserves all the mutexes of a hash table, in an ascending order. */
+
+void
+hash_mutex_enter_all(
+/*=================*/
+ hash_table_t* table); /* in: hash table */
+/****************************************************************
+Releases all the mutexes of a hash table. */
+
+void
+hash_mutex_exit_all(
+/*================*/
+ hash_table_t* table); /* in: hash table */
+
+
+struct hash_cell_struct{
+ void* node; /* hash chain node, NULL if none */
+};
+
+/* The hash table structure */
+struct hash_table_struct {
+ ulint n_cells;/* number of cells in the hash table */
+ hash_cell_t* array; /* pointer to cell array */
+ ulint n_mutexes;/* if mutexes != NULL, then the number of
+ mutexes, must be a power of 2 */
+ mutex_t* mutexes;/* NULL, or an array of mutexes used to
+ protect segments of the hash table */
+ mem_heap_t** heaps; /* if this is non-NULL, hash chain nodes for
+ external chaining can be allocated from these
+ memory heaps; there are then n_mutexes many of
+ these heaps */
+	mem_heap_t*	heap;	/* if this is non-NULL, hash chain nodes
+				are allocated from this single heap */
+	ulint		magic_n;/* magic number; see HASH_TABLE_MAGIC_N
+				below */
+};
+
+#define HASH_TABLE_MAGIC_N 76561114
+
+#ifndef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#endif
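
Usage sketch (editorial, not part of the patch): the chain macros are intrusive, so the stored struct must carry its own fold value and a chain pointer whose field name is passed to the macros as NAME. The example struct, its fields and the cell count are hypothetical; no mutexes are created here, so the mutex assertions in the macros are vacuous.

#include "hash0hash.h"

typedef struct example_struct	example_t;
struct example_struct {
	ulint		id;	/* key from which 'fold' was computed */
	ulint		fold;	/* folded value of the key */
	example_t*	hash;	/* hash chain link; passed as NAME below */
};

static void
example_hash_usage(example_t* item)
{
	hash_table_t*	table;
	example_t*	found;

	table = hash_create(1000);

	HASH_INSERT(example_t, hash, table, item->fold, item);

	/* HASH_SEARCH assigns to 'found' the first struct on the chain
	for which TEST holds, or NULL if there is none. */
	HASH_SEARCH(hash, table, item->fold, found, found->id == item->id);
	ut_ad(found == item);

	HASH_DELETE(example_t, hash, table, item->fold, item);

	hash_table_free(table);
}
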
diff --git a/innobase/include/hash0hash.ic b/innobase/include/hash0hash.ic
new file mode 100644
index 00000000000..3ed2f9088dd
--- /dev/null
+++ b/innobase/include/hash0hash.ic
@@ -0,0 +1,131 @@
+/******************************************************
+The simple hash table utility
+
+(c) 1997 Innobase Oy
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ut0rnd.h"
+
+/****************************************************************
+Gets the nth cell in a hash table. */
+UNIV_INLINE
+hash_cell_t*
+hash_get_nth_cell(
+/*==============*/
+ /* out: pointer to cell */
+ hash_table_t* table, /* in: hash table */
+ ulint n) /* in: cell index */
+{
+ ut_ad(n >= 0);
+ ut_ad(n < table->n_cells);
+
+ return(table->array + n);
+}
+
+/*****************************************************************
+Returns the number of cells in a hash table. */
+UNIV_INLINE
+ulint
+hash_get_n_cells(
+/*=============*/
+ /* out: number of cells */
+ hash_table_t* table) /* in: table */
+{
+ return(table->n_cells);
+}
+
+/******************************************************************
+Calculates the hash value from a folded value. */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+ /* out: hashed value */
+ ulint fold, /* in: folded value */
+ hash_table_t* table) /* in: hash table */
+{
+ return(ut_hash_ulint(fold, table->n_cells));
+}
+
+/****************************************************************
+Gets the mutex index for a fold value in a hash table. */
+UNIV_INLINE
+ulint
+hash_get_mutex_no(
+/*==============*/
+ /* out: mutex number */
+ hash_table_t* table, /* in: hash table */
+ ulint fold) /* in: fold */
+{
+ return(ut_2pow_remainder(fold, table->n_mutexes));
+}
+
+/****************************************************************
+Gets the nth heap in a hash table. */
+UNIV_INLINE
+mem_heap_t*
+hash_get_nth_heap(
+/*==============*/
+ /* out: mem heap */
+ hash_table_t* table, /* in: hash table */
+ ulint i) /* in: index of the heap */
+{
+ ut_ad(i < table->n_mutexes);
+
+ return(table->heaps[i]);
+}
+
+/****************************************************************
+Gets the heap for a fold value in a hash table. */
+UNIV_INLINE
+mem_heap_t*
+hash_get_heap(
+/*==========*/
+ /* out: mem heap */
+ hash_table_t* table, /* in: hash table */
+ ulint fold) /* in: fold */
+{
+ ulint i;
+
+ if (table->heap) {
+ return(table->heap);
+ }
+
+ i = hash_get_mutex_no(table, fold);
+
+ return(hash_get_nth_heap(table, i));
+}
+
+/****************************************************************
+Gets the nth mutex in a hash table. */
+UNIV_INLINE
+mutex_t*
+hash_get_nth_mutex(
+/*===============*/
+ /* out: mutex */
+ hash_table_t* table, /* in: hash table */
+ ulint i) /* in: index of the mutex */
+{
+ ut_ad(i < table->n_mutexes);
+
+ return(table->mutexes + i);
+}
+
+/****************************************************************
+Gets the mutex for a fold value in a hash table. */
+UNIV_INLINE
+mutex_t*
+hash_get_mutex(
+/*===========*/
+ /* out: mutex */
+ hash_table_t* table, /* in: hash table */
+ ulint fold) /* in: fold */
+{
+ ulint i;
+
+ i = hash_get_mutex_no(table, fold);
+
+ return(hash_get_nth_mutex(table, i));
+}
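
Worked example (editorial note): since n_mutexes must be a power of 2, hash_get_mutex_no() via ut_2pow_remainder() is effectively fold & (n_mutexes - 1); e.g. with n_mutexes = 4, a fold value of 777 maps to mutex number 777 mod 4 = 1, and hash_get_heap() picks the corresponding heap the same way when per-mutex heaps are used.
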
diff --git a/innobase/include/ib_odbc.h b/innobase/include/ib_odbc.h
new file mode 100644
index 00000000000..86884b41d39
--- /dev/null
+++ b/innobase/include/ib_odbc.h
@@ -0,0 +1,149 @@
+/******************************************************
+Innobase ODBC client library header; this is equivalent to
+the standard sql.h ODBC header file
+
+(c) 1998 Innobase Oy
+
+Created 2/22/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef ib_odbc_h
+#define ib_odbc_h
+
+typedef unsigned char UCHAR;
+typedef signed char SCHAR;
+typedef long int SDWORD;
+typedef short int SWORD;
+typedef unsigned long int UDWORD;
+typedef unsigned short int UWORD;
+
+typedef void* PTR;
+
+typedef void* HENV;
+typedef void* HDBC;
+typedef void* HSTMT;
+
+typedef signed short RETCODE;
+
+/* RETCODEs */
+#define SQL_NO_DATA_FOUND (-3)
+#define SQL_INVALID_HANDLE (-2)
+#define SQL_ERROR (-1)
+#define SQL_SUCCESS 0
+
+/* Standard SQL datatypes, using ANSI type numbering */
+#define SQL_CHAR 1
+#define SQL_INTEGER 4
+#define SQL_VARCHAR 12
+
+/* C datatype to SQL datatype mapping */
+#define SQL_C_CHAR SQL_CHAR
+#define SQL_C_LONG SQL_INTEGER
+
+/* Special length value */
+#define SQL_NULL_DATA (-1)
+
+#define SQL_PARAM_INPUT 1
+#define SQL_PARAM_OUTPUT 4
+
+/* Null handles */
+#define SQL_NULL_HENV NULL
+#define SQL_NULL_HDBC NULL
+#define SQL_NULL_HSTM NULL
+
+
+/**************************************************************************
+Allocates an SQL environment. */
+
+RETCODE
+SQLAllocEnv(
+/*========*/
+ /* out: SQL_SUCCESS */
+ HENV* phenv); /* out: pointer to an environment handle */
+/**************************************************************************
+Allocates an SQL connection. */
+
+RETCODE
+SQLAllocConnect(
+/*============*/
+ /* out: SQL_SUCCESS */
+ HENV henv, /* in: pointer to an environment handle */
+ HDBC* phdbc); /* out: pointer to a connection handle */
+/**************************************************************************
+Allocates an SQL statement. */
+
+RETCODE
+SQLAllocStmt(
+/*=========*/
+ HDBC hdbc, /* in: SQL connection */
+ HSTMT* phstmt); /* out: pointer to a statement handle */
+/**************************************************************************
+Connects to a database server process (establishes a connection and a
+session). */
+
+RETCODE
+SQLConnect(
+/*=======*/
+ /* out: SQL_SUCCESS */
+ HDBC hdbc, /* in: SQL connection handle */
+ UCHAR* szDSN, /* in: data source name (server name) */
+ SWORD cbDSN, /* in: data source name length */
+ UCHAR* szUID, /* in: user name */
+ SWORD cbUID, /* in: user name length */
+ UCHAR* szAuthStr, /* in: password */
+ SWORD cbAuthStr); /* in: password length */
+/**************************************************************************
+Makes the server parse and optimize an SQL string. */
+
+RETCODE
+SQLPrepare(
+/*=======*/
+ /* out: SQL_SUCCESS */
+ HSTMT hstmt, /* in: statement handle */
+ UCHAR* szSqlStr, /* in: SQL string */
+ SDWORD cbSqlStr); /* in: SQL string length */
+/**************************************************************************
+Binds a parameter in a prepared statement. */
+
+RETCODE
+SQLBindParameter(
+/*=============*/
+ /* out: SQL_SUCCESS */
+ HSTMT hstmt, /* in: statement handle */
+ UWORD ipar, /* in: parameter index, starting from 1 */
+ SWORD fParamType, /* in: SQL_PARAM_INPUT or SQL_PARAM_OUTPUT */
+ SWORD fCType, /* in: SQL_C_CHAR, ... */
+ SWORD fSqlType, /* in: SQL_CHAR, ... */
+ UDWORD cbColDef, /* in: precision: ignored */
+ SWORD ibScale, /* in: scale: ignored */
+ PTR rgbValue, /* in: pointer to a buffer for the data */
+ SDWORD cbValueMax, /* in: buffer size */
+ SDWORD* pcbValue); /* in: pointer to a buffer for the data
+ length or SQL_NULL_DATA */
+/**************************************************************************
+Executes a prepared statement where all parameters have been bound. */
+
+RETCODE
+SQLExecute(
+/*=======*/
+ /* out: SQL_SUCCESS or SQL_ERROR */
+ HSTMT hstmt); /* in: statement handle */
+/**************************************************************************
+Queries an error message. */
+
+RETCODE
+SQLError(
+/*=====*/
+ /* out: SQL_SUCCESS or SQL_NO_DATA_FOUND */
+ HENV henv, /* in: SQL_NULL_HENV */
+ HDBC hdbc, /* in: SQL_NULL_HDBC */
+ HSTMT hstmt, /* in: statement handle */
+ UCHAR* szSqlState, /* in/out: SQLSTATE as a null-terminated string,
+ (currently, always == "S1000") */
+ SDWORD* pfNativeError, /* out: native error code */
+ UCHAR* szErrorMsg, /* in/out: buffer for an error message as a
+ null-terminated string */
+ SWORD cbErrorMsgMax, /* in: buffer size for szErrorMsg */
+ SWORD* pcbErrorMsg); /* out: error message length */
+
+#endif
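
Usage sketch (editorial, not part of the patch) of the call sequence this header implies: allocate the handles, connect, prepare a statement with one bound input parameter, execute, and fetch the error text on failure. The data source name, credentials and SQL text are placeholders.

#include "ib_odbc.h"

static RETCODE
example_odbc_insert(void)
{
	HENV	henv;
	HDBC	hdbc;
	HSTMT	hstmt;
	SDWORD	id	= 42;
	SDWORD	id_len	= sizeof(id);
	UCHAR	state[6];		/* SQLSTATE, e.g. "S1000" */
	SDWORD	native_err;
	UCHAR	msg[256];
	SWORD	msg_len;
	RETCODE	rc;

	SQLAllocEnv(&henv);
	SQLAllocConnect(henv, &hdbc);
	SQLConnect(hdbc, (UCHAR*)"testdb", 6, (UCHAR*)"user", 4,
						(UCHAR*)"secret", 6);
	SQLAllocStmt(hdbc, &hstmt);

	SQLPrepare(hstmt, (UCHAR*)"INSERT INTO T VALUES (?)", 24);
	SQLBindParameter(hstmt, 1, SQL_PARAM_INPUT, SQL_C_LONG, SQL_INTEGER,
				0, 0, &id, sizeof(id), &id_len);
	rc = SQLExecute(hstmt);

	if (rc == SQL_ERROR) {
		SQLError(SQL_NULL_HENV, SQL_NULL_HDBC, hstmt,
			state, &native_err, msg, sizeof(msg), &msg_len);
	}

	return(rc);
}
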
diff --git a/innobase/include/ibuf0ibuf.h b/innobase/include/ibuf0ibuf.h
new file mode 100644
index 00000000000..f0b333192de
--- /dev/null
+++ b/innobase/include/ibuf0ibuf.h
@@ -0,0 +1,268 @@
+/******************************************************
+Insert buffer
+
+(c) 1997 Innobase Oy
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0ibuf_h
+#define ibuf0ibuf_h
+
+#include "univ.i"
+
+#include "dict0mem.h"
+#include "dict0dict.h"
+#include "mtr0mtr.h"
+#include "que0types.h"
+#include "ibuf0types.h"
+#include "fsp0fsp.h"
+
+extern ibuf_t* ibuf;
+
+/**********************************************************************
+Creates the insert buffer data struct for a single tablespace. Reads the
+root page of the insert buffer tree in the tablespace. This function can
+be called only after the dictionary system has been initialized, as it also
+creates the insert buffer table and index for this tablespace. */
+
+ibuf_data_t*
+ibuf_data_init_for_space(
+/*=====================*/
+ /* out, own: ibuf data struct, linked to the list
+ in ibuf control structure. */
+ ulint space); /* in: space id */
+/**********************************************************************
+Creates the insert buffer data structure at a database startup and
+initializes the data structures for the insert buffer of each tablespace. */
+
+void
+ibuf_init_at_db_start(void);
+/*=======================*/
+/*************************************************************************
+Initializes an ibuf bitmap page. */
+
+void
+ibuf_bitmap_page_init(
+/*==================*/
+ page_t* page, /* in: bitmap page */
+ mtr_t* mtr); /* in: mtr */
+/****************************************************************************
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict further
+work to only ibuf bitmap operations, which would result if the latch to the
+bitmap page were kept. */
+
+void
+ibuf_reset_free_bits_with_type(
+/*===========================*/
+ ulint type, /* in: index type */
+ page_t* page); /* in: index page; free bits are set to 0 if the index
+ is non-clustered and non-unique and the page level is
+ 0 */
+/****************************************************************************
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict further
+work to solely ibuf bitmap operations, which would result if the latch to
+the bitmap page were kept. */
+
+void
+ibuf_reset_free_bits(
+/*=================*/
+ dict_index_t* index, /* in: index */
+ page_t* page); /* in: index page; free bits are set to 0 if
+ the index is non-clustered and non-unique and
+ the page level is 0 */
+/****************************************************************************
+Updates the free bits of the page in the ibuf bitmap if there is not enough
+free space on the page any more. This is done in a separate mini-transaction,
+hence this operation does not restrict further work to only ibuf bitmap
+operations, which would result if the latch to the bitmap page were kept. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ dict_index_t* index, /* in: index */
+ page_t* page, /* in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/* in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase);/* in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+/**************************************************************************
+Updates the free bits for the page to reflect the present state. Does this
+in the mtr given, which means that the latching order rules virtually
+prevent any further operations for this OS thread until mtr is committed. */
+
+void
+ibuf_update_free_bits_low(
+/*======================*/
+ dict_index_t* index, /* in: index */
+ page_t* page, /* in: index page */
+ ulint max_ins_size, /* in: value of maximum insert size
+ with reorganize before the latest
+ operation performed to the page */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+Updates the free bits for the two pages to reflect the present state. Does
+this in the mtr given, which means that the latching order rules virtually
+prevent any further operations until mtr is committed. */
+
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+ dict_index_t* index, /* in: index */
+ page_t* page1, /* in: index page */
+ page_t* page2, /* in: index page */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+A basic partial test of whether an insert to the insert buffer could be
+possible and recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index); /* in: index where to insert */
+/**********************************************************************
+Returns TRUE if the current OS thread is performing an insert buffer
+routine. */
+
+ibool
+ibuf_inside(void);
+/*=============*/
+ /* out: TRUE if inside an insert buffer routine: for instance,
+ a read-ahead of non-ibuf pages is then forbidden */
+/***************************************************************************
+Checks if a page address is an ibuf bitmap page (level 3 page) address. */
+UNIV_INLINE
+ibool
+ibuf_bitmap_page(
+/*=============*/
+ /* out: TRUE if a bitmap page */
+ ulint page_no);/* in: page number */
+/***************************************************************************
+Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. */
+
+ibool
+ibuf_page(
+/*======*/
+ /* out: TRUE if level 2 or level 3 page */
+ ulint space, /* in: space id */
+ ulint page_no);/* in: page number */
+/***************************************************************************
+Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. */
+
+ibool
+ibuf_page_low(
+/*==========*/
+ /* out: TRUE if level 2 or level 3 page */
+ ulint space, /* in: space id */
+ ulint page_no,/* in: page number */
+ mtr_t* mtr); /* in: mtr which will contain an x-latch to the
+ bitmap page if the page is not one of the fixed
+ address ibuf pages */
+/*************************************************************************
+Checks if an index page has so much free space that the free bit should
+be set TRUE in the ibuf bitmap. */
+
+ibool
+ibuf_index_page_has_free(
+/*=====================*/
+ /* out: TRUE if there is enough free space */
+ page_t* page); /* in: non-unique secondary index page */
+/***************************************************************************
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+
+void
+ibuf_free_excess_pages(
+/*===================*/
+ ulint space); /* in: space id */
+/*************************************************************************
+Makes an index insert to the insert buffer, instead of directly to the disk
+page, if this is possible. Does not do insert if the index is clustered
+or unique. */
+
+ibool
+ibuf_insert(
+/*========*/
+ /* out: TRUE if success */
+ dtuple_t* entry, /* in: index entry to insert */
+ dict_index_t* index, /* in: index where to insert */
+ ulint space, /* in: space id where to insert */
+ ulint page_no,/* in: page number where to insert */
+ que_thr_t* thr); /* in: query thread */
+/*************************************************************************
+When an index page is read from a disk to the buffer pool, this function
+inserts to the page the possible index entries buffered in the insert buffer.
+The entries are deleted from the insert buffer. If the page is not read, but
+created in the buffer pool, this function deletes its buffered entries from
+the insert buffer; note that there can exist entries if the page belonged to
+an index which was dropped. */
+
+void
+ibuf_merge_or_delete_for_page(
+/*==========================*/
+ page_t* page, /* in: if page has been read from disk, pointer to
+ the page x-latched, else NULL */
+ ulint space, /* in: space id of the index page */
+ ulint page_no);/* in: page number of the index page */
+/*************************************************************************
+Contracts insert buffer trees by reading pages to the buffer pool. */
+
+ulint
+ibuf_contract(
+/*==========*/
+ /* out: a lower limit for the combined size in bytes
+ of entries which will be merged from ibuf trees to the
+ pages read, 0 if ibuf is empty */
+ ibool sync); /* in: TRUE if the caller wants to wait for the
+ issued read with the highest tablespace address
+ to complete */
+/*************************************************************************
+Parses a redo log record of an ibuf bitmap page init. */
+
+byte*
+ibuf_parse_bitmap_init(
+/*===================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+/**********************************************************************
+Gets the ibuf count for a given page. */
+
+ulint
+ibuf_count_get(
+/*===========*/
+ /* out: number of entries in the insert buffer
+ currently buffered for this page */
+ ulint space, /* in: space id */
+ ulint page_no);/* in: page number */
+/**********************************************************************
+Prints info of ibuf. */
+
+void
+ibuf_print(void);
+/*============*/
+
+#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO
+#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO
+
+/* The ibuf header page currently contains only the file segment header
+for the file segment from which the pages for the ibuf tree are allocated */
+#define IBUF_HEADER PAGE_DATA
+#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */
+
+#ifndef UNIV_NONINL
+#include "ibuf0ibuf.ic"
+#endif
+
+#endif
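
Sketch (editorial, not part of the patch) of how a caller on the secondary-index insert path might consult these routines: ibuf_should_try() is the cheap heuristic check and ibuf_insert() attempts the buffered insert. The wrapping function is hypothetical; on FALSE the caller would read the leaf page and insert directly.

#include "ibuf0ibuf.h"

static ibool
example_try_buffered_insert(
	dtuple_t*	entry,		/* in: index entry to insert */
	dict_index_t*	index,		/* in: index where to insert */
	ulint		space,		/* in: space id */
	ulint		page_no,	/* in: leaf page number */
	que_thr_t*	thr)		/* in: query thread */
{
	if (!ibuf_should_try(index)) {

		return(FALSE);
	}

	/* May still return FALSE when the insert cannot be buffered;
	the caller then inserts directly into the page. */
	return(ibuf_insert(entry, index, space, page_no, thr));
}
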
diff --git a/innobase/include/ibuf0ibuf.ic b/innobase/include/ibuf0ibuf.ic
new file mode 100644
index 00000000000..e969a0550da
--- /dev/null
+++ b/innobase/include/ibuf0ibuf.ic
@@ -0,0 +1,226 @@
+/******************************************************
+Insert buffer
+
+(c) 1997 Innobase Oy
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+#include "page0page.h"
+
+extern ulint ibuf_flush_count;
+
+/* If this number is n, an index page must contain at least UNIV_PAGE_SIZE / n
+bytes of free space for ibuf to try to buffer inserts to this page.
+If there is this much of free space, the corresponding bits are set in the
+ibuf bitmap. */
+#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32
+
+/* Insert buffer data struct for a single tablespace */
+struct ibuf_data_struct{
+ ulint space; /* space id */
+	ulint		seg_size;/* allocated pages of the file segment
+				containing the ibuf header and tree */
+ ulint size; /* size of the insert buffer tree in pages */
+ ibool empty; /* after an insert to the ibuf tree is
+ performed, this is set to FALSE, and if a
+ contract operation finds the tree empty, this
+ is set to TRUE */
+ ulint free_list_len;
+ /* length of the free list */
+ ulint height; /* tree height */
+ dict_index_t* index; /* insert buffer index */
+ UT_LIST_NODE_T(ibuf_data_t) data_list;
+ /* list of ibuf data structs */
+ ulint n_inserts;/* number of inserts made to the insert
+ buffer */
+ ulint n_merges;/* number of pages merged */
+ ulint n_merged_recs;/* number of records merged */
+};
+
+/* If the ibuf meter exceeds this value, then the suitable inserts are made to
+the insert buffer instead of directly to the disk page */
+#define IBUF_THRESHOLD 50
+
+struct ibuf_struct{
+ ulint size; /* current size of the ibuf index
+ trees in pages */
+ ulint max_size; /* recommended maximum size in pages
+ for the ibuf index tree */
+ ulint meter; /* heuristic meter which measures
+ desirability of doing inserts to the
+ insert buffer instead of directly to
+ the disk page */
+ UT_LIST_BASE_NODE_T(ibuf_data_t) data_list;
+ /* list of ibuf data structs for
+ each tablespace */
+};
+
+/****************************************************************************
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+
+void
+ibuf_set_free_bits(
+/*===============*/
+ ulint type, /* in: index type */
+	page_t*	page,	/* in: index page; free bit is reset if the index
+			is non-clustered and non-unique, and the page level
+			is 0 */
+ ulint val, /* in: value to set: < 4 */
+ ulint max_val);/* in: ULINT_UNDEFINED or a maximum value which
+ the bits must have before setting; this is for
+ debugging */
+
+/**************************************************************************
+A basic partial test of whether an insert to the insert buffer could be
+possible and recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index) /* in: index where to insert */
+{
+ if (!(index->type & (DICT_CLUSTERED | DICT_UNIQUE))
+ && ibuf->meter > IBUF_THRESHOLD) {
+ ibuf_flush_count++;
+
+ if (ibuf_flush_count % 8 == 0) {
+
+ buf_LRU_try_free_flushed_blocks();
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************************
+Checks if a page address is an ibuf bitmap page address. */
+UNIV_INLINE
+ibool
+ibuf_bitmap_page(
+/*=============*/
+ /* out: TRUE if a bitmap page */
+ ulint page_no)/* in: page number */
+{
+ if (page_no % XDES_DESCRIBED_PER_PAGE == FSP_IBUF_BITMAP_OFFSET) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************************
+Translates the free space on a page to a value in the ibuf bitmap.*/
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_bits(
+/*===========================*/
+ /* out: value for ibuf bitmap bits */
+ ulint max_ins_size) /* in: maximum insert size after reorganize
+ for the page */
+{
+ ulint n;
+
+ n = max_ins_size / (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+ if (n == 3) {
+ n = 2;
+ }
+
+ if (n > 3) {
+ n = 3;
+ }
+
+ return(n);
+}
+
+/*************************************************************************
+Translates the ibuf free bits to the free space on a page in bytes. */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_from_bits(
+/*================================*/
+ /* out: maximum insert size after reorganize for the
+ page */
+ ulint bits) /* in: value for ibuf bitmap bits */
+{
+ ut_ad(bits < 4);
+
+ if (bits == 3) {
+ return(4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ return(bits * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+}
+
+/*************************************************************************
+Translates the free space on a page to a value in the ibuf bitmap.*/
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free(
+/*======================*/
+ /* out: value for ibuf bitmap bits */
+ page_t* page) /* in: non-unique secondary index page */
+{
+ return(ibuf_index_page_calc_free_bits(
+ page_get_max_insert_size_after_reorganize(page, 1)));
+}
+
+/****************************************************************************
+Updates the free bits of the page in the ibuf bitmap if there is not enough
+free space on the page any more. This is done in a separate mini-transaction,
+hence this operation does not restrict further work to only ibuf bitmap
+operations, which would result if the latch to the bitmap page were kept. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ dict_index_t* index, /* in: index */
+ page_t* page, /* in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/* in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase)/* in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+{
+ ulint before;
+ ulint after;
+
+ before = ibuf_index_page_calc_free_bits(max_ins_size);
+
+ if (max_ins_size >= increase) {
+ ut_ad(ULINT_UNDEFINED > UNIV_PAGE_SIZE);
+
+ after = ibuf_index_page_calc_free_bits(max_ins_size
+ - increase);
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(after <= ibuf_index_page_calc_free(page));
+#endif
+ } else {
+ after = ibuf_index_page_calc_free(page);
+ }
+
+ if (after == 0) {
+ /* We move the page to front of the buffer pool LRU list:
+ the purpose of this is to prevent those pages to which we
+ cannot make inserts using the insert buffer from slipping
+ out of the buffer pool */
+
+ buf_page_make_young(page);
+ }
+
+ if (before > after) {
+ ibuf_set_free_bits(index->type, page, after, before);
+ }
+}
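
Worked example (editorial note): with IBUF_PAGE_SIZE_PER_FREE_SPACE = 32, one bitmap unit corresponds to UNIV_PAGE_SIZE / 32 bytes of free space, i.e. 512 bytes if the page size is the usual 16 kB (an assumption; UNIV_PAGE_SIZE is defined elsewhere). Then ibuf_index_page_calc_free_bits() maps a max_ins_size of 1400 to 1400 / 512 = 2, maps 1600 (quotient 3) down to 2, and clamps 2100 (quotient 4) to the maximum value 3; conversely, ibuf_index_page_calc_free_from_bits() maps 2 back to 1024 bytes and 3 to 4 * 512 = 2048 bytes.
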
diff --git a/innobase/include/ibuf0types.h b/innobase/include/ibuf0types.h
new file mode 100644
index 00000000000..fb202ac44b0
--- /dev/null
+++ b/innobase/include/ibuf0types.h
@@ -0,0 +1,15 @@
+/******************************************************
+Insert buffer global types
+
+(c) 1997 Innobase Oy
+
+Created 7/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0types_h
+#define ibuf0types_h
+
+typedef struct ibuf_data_struct ibuf_data_t;
+typedef struct ibuf_struct ibuf_t;
+
+#endif
diff --git a/innobase/include/lock0lock.h b/innobase/include/lock0lock.h
new file mode 100644
index 00000000000..d2d4ce9290d
--- /dev/null
+++ b/innobase/include/lock0lock.h
@@ -0,0 +1,538 @@
+/******************************************************
+The transaction lock system
+
+(c) 1996 Innobase Oy
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0lock_h
+#define lock0lock_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "rem0types.h"
+#include "dict0types.h"
+#include "que0types.h"
+#include "page0types.h"
+#include "lock0types.h"
+#include "read0types.h"
+#include "hash0hash.h"
+
+extern ibool lock_print_waits;
+
+/*****************************************************************
+Cancels a waiting record lock request and releases the waiting transaction
+that requested it. NOTE: does NOT check if waiting lock requests behind this
+one can now be granted! */
+
+void
+lock_rec_cancel(
+/*============*/
+ lock_t* lock); /* in: waiting record lock request */
+/*************************************************************************
+Creates the lock system at database start. */
+
+void
+lock_sys_create(
+/*============*/
+ ulint n_cells); /* in: number of slots in lock hash table */
+/*************************************************************************
+Checks if some transaction has an implicit x-lock on a record in a secondary
+index. */
+
+trx_t*
+lock_sec_rec_some_has_impl_off_kernel(
+/*==================================*/
+ /* out: transaction which has the x-lock, or
+ NULL */
+ rec_t* rec, /* in: user record */
+ dict_index_t* index); /* in: secondary index */
+/*************************************************************************
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index. */
+UNIV_INLINE
+trx_t*
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ /* out: transaction which has the x-lock, or
+ NULL */
+ rec_t* rec, /* in: user record */
+ dict_index_t* index); /* in: clustered index */
+/*****************************************************************
+Resets the lock bits for a single record. Releases transactions
+waiting for lock requests here. */
+
+void
+lock_rec_reset_and_release_wait(
+/*============================*/
+ rec_t* rec); /* in: record whose locks bits should be reset */
+/*****************************************************************
+Makes a record inherit the locks of another record as gap type
+locks, but does not reset the lock bits of the other record. Also
+waiting lock requests on rec are inherited as GRANTED gap locks. */
+
+void
+lock_rec_inherit_to_gap(
+/*====================*/
+ rec_t* heir, /* in: record which inherits */
+ rec_t* rec); /* in: record from which inherited; does NOT reset
+ the locks on this record */
+/*****************************************************************
+Updates the lock table when we have reorganized a page. NOTE: we copy
+also the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+
+void
+lock_move_reorganize_page(
+/*======================*/
+ page_t* page, /* in: old index page */
+ page_t* new_page); /* in: reorganized page */
+/*****************************************************************
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+
+void
+lock_move_rec_list_end(
+/*===================*/
+ page_t* new_page, /* in: index page to move to */
+ page_t* page, /* in: index page */
+ rec_t* rec); /* in: record on page: this is the
+ first record moved */
+/*****************************************************************
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+
+void
+lock_move_rec_list_start(
+/*=====================*/
+ page_t* new_page, /* in: index page to move to */
+ page_t* page, /* in: index page */
+ rec_t* rec, /* in: record on page: this is the
+ first record NOT copied */
+ rec_t* old_end); /* in: old previous-to-last record on
+ new_page before the records were copied */
+/*****************************************************************
+Updates the lock table when a page is split to the right. */
+
+void
+lock_update_split_right(
+/*====================*/
+ page_t* right_page, /* in: right page */
+ page_t* left_page); /* in: left page */
+/*****************************************************************
+Updates the lock table when a page is merged to the right. */
+
+void
+lock_update_merge_right(
+/*====================*/
+ rec_t* orig_succ, /* in: original successor of infimum
+ on the right page before merge */
+ page_t* left_page); /* in: merged index page which will be
+ discarded */
+/*****************************************************************
+Updates the lock table when the root page is copied to another in
+btr_root_raise_and_insert. Note that we leave lock structs on the
+root page, even though they do not make sense on other than leaf
+pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+
+void
+lock_update_root_raise(
+/*===================*/
+ page_t* new_page, /* in: index page to which copied */
+ page_t* root); /* in: root page */
+/*****************************************************************
+Updates the lock table when a page is copied to another and the original page
+is removed from the chain of leaf pages, except if page is the root! */
+
+void
+lock_update_copy_and_discard(
+/*=========================*/
+ page_t* new_page, /* in: index page to which copied */
+ page_t* page); /* in: index page; NOT the root! */
+/*****************************************************************
+Updates the lock table when a page is split to the left. */
+
+void
+lock_update_split_left(
+/*===================*/
+ page_t* right_page, /* in: right page */
+ page_t* left_page); /* in: left page */
+/*****************************************************************
+Updates the lock table when a page is merged to the left. */
+
+void
+lock_update_merge_left(
+/*===================*/
+ page_t* left_page, /* in: left page to which merged */
+ rec_t* orig_pred, /* in: original predecessor of supremum
+ on the left page before merge */
+ page_t* right_page); /* in: merged index page which will be
+ discarded */
+/*****************************************************************
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+ rec_t* heir, /* in: heir record */
+ rec_t* rec); /* in: record */
+/*****************************************************************
+Updates the lock table when a page is discarded. */
+
+void
+lock_update_discard(
+/*================*/
+ rec_t* heir, /* in: record which will inherit the locks */
+ page_t* page); /* in: index page which will be discarded */
+/*****************************************************************
+Updates the lock table when a new user record is inserted. */
+
+void
+lock_update_insert(
+/*===============*/
+ rec_t* rec); /* in: the inserted record */
+/*****************************************************************
+Updates the lock table when a record is removed. */
+
+void
+lock_update_delete(
+/*===============*/
+ rec_t* rec); /* in: the record to be removed */
+/*************************************************************************
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. In such an
+update the record is moved, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+ rec_t* rec); /* in: record whose lock state is stored
+ on the infimum record of the same page; lock
+ bits are reset on the record */
+/*************************************************************************
+Restores the state of explicit lock requests on a single record, where the
+state was stored on the infimum of the page. */
+
+void
+lock_rec_restore_from_page_infimum(
+/*===============================*/
+ rec_t* rec, /* in: record whose lock state is restored */
+ page_t* page); /* in: page (rec is not necessarily on this page)
+ whose infimum stored the lock state; lock bits are
+ reset on the infimum */
+/*************************************************************************
+Returns TRUE if there are explicit record locks on a page. */
+
+ibool
+lock_rec_expl_exist_on_page(
+/*========================*/
+ /* out: TRUE if there are explicit record locks on
+ the page */
+ ulint space, /* in: space id */
+ ulint page_no);/* in: page number */
+/*************************************************************************
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue. */
+
+ulint
+lock_rec_insert_check_and_lock(
+/*===========================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ rec_t* rec, /* in: record after which to insert */
+ dict_index_t* index, /* in: index */
+ que_thr_t* thr, /* in: query thread */
+ ibool* inherit);/* out: set to TRUE if the new inserted
+ record maybe should inherit LOCK_GAP type
+ locks from the successor record */
+/*************************************************************************
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue. */
+
+ulint
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ rec_t* rec, /* in: record which should be modified */
+ dict_index_t* index, /* in: clustered index */
+ que_thr_t* thr); /* in: query thread */
+/*************************************************************************
+Checks if locks of other transactions prevent an immediate modify
+(delete mark or delete unmark) of a secondary index record. */
+
+ulint
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ rec_t* rec, /* in: record which should be modified;
+ NOTE: as this is a secondary index, we
+ always have to modify the clustered index
+ record first: see the comment below */
+ dict_index_t* index, /* in: secondary index */
+ que_thr_t* thr); /* in: query thread */
+/*************************************************************************
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. */
+
+ulint
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ rec_t* rec, /* in: user record or page supremum record
+ which should be read or passed over by a read
+ cursor */
+ dict_index_t* index, /* in: clustered index */
+ ulint mode, /* in: mode of the lock which the read cursor
+ should set on records: LOCK_S or LOCK_X; the
+ latter is possible in SELECT FOR UPDATE */
+ que_thr_t* thr); /* in: query thread */
+/*************************************************************************
+Like the counterpart for a clustered index above, but now we read a
+secondary index record. */
+
+ulint
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ rec_t* rec, /* in: user record or page supremum record
+ which should be read or passed over by a read
+ cursor */
+ dict_index_t* index, /* in: secondary index */
+ ulint mode, /* in: mode of the lock which the read cursor
+ should set on records: LOCK_S or LOCK_X; the
+ latter is possible in SELECT FOR UPDATE */
+ que_thr_t* thr); /* in: query thread */
+/*************************************************************************
+Checks that a record is seen in a consistent read. */
+
+ibool
+lock_clust_rec_cons_read_sees(
+/*==========================*/
+ /* out: TRUE if sees, or FALSE if an earlier
+ version of the record should be retrieved */
+ rec_t* rec, /* in: user record which should be read or
+ passed over by a read cursor */
+ dict_index_t* index, /* in: clustered index */
+ read_view_t* view); /* in: consistent read view */
+/*************************************************************************
+Checks that a non-clustered index record is seen in a consistent read. */
+
+ulint
+lock_sec_rec_cons_read_sees(
+/*========================*/
+ /* out: TRUE if certainly sees, or FALSE if an
+ earlier version of the clustered index record
+ might be needed: NOTE that a non-clustered
+ index page contains so little information on
+					its modifications that even in the case FALSE,
+					the present version of rec may be the right one,
+ but we must check this from the clustered
+ index record */
+ rec_t* rec, /* in: user record which should be read or
+ passed over by a read cursor */
+ dict_index_t* index, /* in: non-clustered index */
+ read_view_t* view); /* in: consistent read view */
+/*************************************************************************
+Locks the specified database table in the mode given. If the lock cannot
+be granted immediately, the query thread is put to wait. */
+
+ulint
+lock_table(
+/*=======*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ dict_table_t* table, /* in: database table in dictionary cache */
+ ulint mode, /* in: lock mode */
+ que_thr_t* thr); /* in: query thread */
+/*************************************************************************
+Checks if there are any locks set on the table. */
+
+ibool
+lock_is_on_table(
+/*=============*/
+ /* out: TRUE if there are lock(s) */
+ dict_table_t* table); /* in: database table in dictionary cache */
+/*************************************************************************
+Releases transaction locks, and releases possible other transactions waiting
+because of these locks. */
+
+void
+lock_release_off_kernel(
+/*====================*/
+ trx_t* trx); /* in: transaction */
+/*************************************************************************
+Calculates the fold value of a page file address: used in inserting or
+searching for a lock in the hash table. */
+UNIV_INLINE
+ulint
+lock_rec_fold(
+/*===========*/
+ /* out: folded value */
+ ulint space, /* in: space */
+ ulint page_no);/* in: page number */
+/*************************************************************************
+Calculates the hash value of a page file address: used in inserting or
+searching for a lock in the hash table. */
+UNIV_INLINE
+ulint
+lock_rec_hash(
+/*==========*/
+ /* out: hashed value */
+ ulint space, /* in: space */
+ ulint page_no);/* in: page number */
+/*************************************************************************
+Gets the mutex protecting record locks on a given page address. */
+
+mutex_t*
+lock_rec_get_mutex_for_addr(
+/*========================*/
+ ulint space, /* in: space id */
+ ulint page_no);/* in: page number */
+/*************************************************************************
+Validates the lock queue on a single record. */
+
+ibool
+lock_rec_queue_validate(
+/*====================*/
+ /* out: TRUE if ok */
+ rec_t* rec, /* in: record to look at */
+ dict_index_t* index); /* in: index, or NULL if not known */
+/*************************************************************************
+Prints info of a table lock. */
+
+void
+lock_table_print(
+/*=============*/
+ lock_t* lock); /* in: table type lock */
+/*************************************************************************
+Prints info of a record lock. */
+
+void
+lock_rec_print(
+/*===========*/
+ lock_t* lock); /* in: record type lock */
+/*************************************************************************
+Prints info of locks for all transactions. */
+
+void
+lock_print_info(void);
+/*=================*/
+/*************************************************************************
+Validates the lock queue on a table. */
+
+ibool
+lock_table_queue_validate(
+/*======================*/
+ /* out: TRUE if ok */
+ dict_table_t* table); /* in: table */
+/*************************************************************************
+Validates the record lock queues on a page. */
+
+ibool
+lock_rec_validate_page(
+/*===================*/
+ /* out: TRUE if ok */
+ ulint space, /* in: space id */
+ ulint page_no);/* in: page number */
+/*************************************************************************
+Validates the lock system. */
+
+ibool
+lock_validate(void);
+/*===============*/
+ /* out: TRUE if ok */
+
+/* The lock system */
+extern lock_sys_t* lock_sys;
+
+/* Lock modes and types */
+#define LOCK_NONE 0 /* this flag is used elsewhere to note
+ consistent read */
+#define LOCK_IS 2 /* intention shared */
+#define LOCK_IX 3 /* intention exclusive */
+#define LOCK_S 4 /* shared */
+#define LOCK_X 5 /* exclusive */
+#define LOCK_MODE_MASK 0xF /* mask used to extract mode from the
+ type_mode field in a lock */
+#define LOCK_TABLE 16 /* these type values should be so high that */
+#define LOCK_REC 32 /* they can be ORed to the lock mode */
+#define LOCK_TYPE_MASK 0xF0 /* mask used to extract lock type from the
+ type_mode field in a lock */
+#define LOCK_WAIT 256 /* this wait bit should be so high that
+ it can be ORed to the lock mode and type;
+ when this bit is set, it means that the
+ lock has not yet been granted, it is just
+ waiting for its turn in the wait queue */
+#define LOCK_GAP 512 /* this gap bit should be so high that
+ it can be ORed to the other flags;
+ when this bit is set, it means that the
+ lock holds only on the gap before the record;
+ for instance, an x-lock on the gap does not
+ give permission to modify the record on which
+ the bit is set; locks of this type are created
+ when records are removed from the index chain
+ of records */
+
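
The mode occupies the low four bits of a lock's flag word, the table/record type the next four, and the wait and gap flags still higher bits, so one ulint can describe a whole request (the comments above refer to this as the type_mode field of a lock; that field name is taken on trust here, the struct itself is not in this header). A small sketch of composing and decoding such a word, with the constants copied from the definitions above:

#include <stdio.h>

/* values copied from the definitions above */
#define LOCK_X		5
#define LOCK_MODE_MASK	0xF
#define LOCK_REC	32
#define LOCK_TYPE_MASK	0xF0
#define LOCK_WAIT	256
#define LOCK_GAP	512

int main(void)
{
	/* a waiting gap x-lock request on a record */
	unsigned long type_mode = LOCK_REC | LOCK_X | LOCK_WAIT | LOCK_GAP;

	printf("mode    = %lu\n", type_mode & LOCK_MODE_MASK); /* 5  = LOCK_X   */
	printf("type    = %lu\n", type_mode & LOCK_TYPE_MASK); /* 32 = LOCK_REC */
	printf("waiting = %s\n", (type_mode & LOCK_WAIT) ? "yes" : "no");
	printf("gap     = %s\n", (type_mode & LOCK_GAP) ? "yes" : "no");

	return(0);
}
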
+/* When lock bits are reset, the following flags are available: */
+#define LOCK_RELEASE_WAIT 1
+#define LOCK_NOT_RELEASE_WAIT 2
+
+/* Lock operation struct */
+typedef struct lock_op_struct lock_op_t;
+struct lock_op_struct{
+ dict_table_t* table; /* table to be locked */
+ ulint mode; /* lock mode */
+};
+
+#define LOCK_OP_START 1
+#define LOCK_OP_COMPLETE 2
+
+/* The lock system struct */
+struct lock_sys_struct{
+ hash_table_t* rec_hash; /* hash table of the record locks */
+};
+
+/* The lock system */
+extern lock_sys_t* lock_sys;
+
+
+#ifndef UNIV_NONINL
+#include "lock0lock.ic"
+#endif
+
+#endif
diff --git a/innobase/include/lock0lock.ic b/innobase/include/lock0lock.ic
new file mode 100644
index 00000000000..64c43c88d2e
--- /dev/null
+++ b/innobase/include/lock0lock.ic
@@ -0,0 +1,80 @@
+/******************************************************
+The transaction lock system
+
+(c) 1996 Innobase Oy
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#include "srv0srv.h"
+#include "dict0dict.h"
+#include "row0row.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "buf0buf.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "row0vers.h"
+#include "que0que.h"
+#include "btr0cur.h"
+#include "read0read.h"
+#include "log0recv.h"
+
+/*************************************************************************
+Calculates the fold value of a page file address: used in inserting or
+searching for a lock in the hash table. */
+UNIV_INLINE
+ulint
+lock_rec_fold(
+/*==========*/
+ /* out: folded value */
+ ulint space, /* in: space */
+ ulint page_no)/* in: page number */
+{
+ return(ut_fold_ulint_pair(space, page_no));
+}
+
+/*************************************************************************
+Calculates the hash value of a page file address: used in inserting or
+searching for a lock in the hash table. */
+UNIV_INLINE
+ulint
+lock_rec_hash(
+/*==========*/
+ /* out: hashed value */
+ ulint space, /* in: space */
+ ulint page_no)/* in: page number */
+{
+ return(hash_calc_hash(lock_rec_fold(space, page_no),
+ lock_sys->rec_hash));
+}
+
+/*************************************************************************
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index. */
+UNIV_INLINE
+trx_t*
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ /* out: transaction which has the x-lock, or
+ NULL */
+ rec_t* rec, /* in: user record */
+ dict_index_t* index) /* in: clustered index */
+{
+ dulint trx_id;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(page_rec_is_user_rec(rec));
+
+ trx_id = row_get_rec_trx_id(rec, index);
+
+ if (trx_is_active(trx_id)) {
+ /* The modifying or inserting transaction is active */
+
+ return(trx_get_on_id(trx_id));
+ }
+
+ return(NULL);
+}
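
lock_rec_fold and lock_rec_hash above locate all record locks of one page through a single hash cell keyed on (space, page_no). A toy model of that addressing scheme, in which the fold function is only a stand-in for ut_fold_ulint_pair and the modulo reduction is an assumption about what hash_calc_hash does:

#include <stdio.h>

static unsigned long toy_fold_pair(unsigned long a, unsigned long b)
{
	/* stand-in for ut_fold_ulint_pair: any mixing of the two words
	will do for the illustration */
	return(a * 1103515245UL + b);
}

int main(void)
{
	unsigned long space   = 0;    /* system tablespace */
	unsigned long page_no = 427;  /* some index page */
	unsigned long n_cells = 1024; /* assumed size of lock_sys->rec_hash */

	unsigned long fold = toy_fold_pair(space, page_no);
	unsigned long cell = fold % n_cells;

	/* every lock on page (0, 427) lands in the same cell, so operations
	on the locks of one page only ever touch one hash chain */
	printf("fold = %lu, cell = %lu\n", fold, cell);

	return(0);
}
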
diff --git a/innobase/include/lock0types.h b/innobase/include/lock0types.h
new file mode 100644
index 00000000000..705e64f6581
--- /dev/null
+++ b/innobase/include/lock0types.h
@@ -0,0 +1,15 @@
+/******************************************************
+The transaction lock system global types
+
+(c) 1996 Innobase Oy
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0types_h
+#define lock0types_h
+
+typedef struct lock_struct lock_t;
+typedef struct lock_sys_struct lock_sys_t;
+
+#endif
diff --git a/innobase/include/log0log.h b/innobase/include/log0log.h
new file mode 100644
index 00000000000..001f98cfc3c
--- /dev/null
+++ b/innobase/include/log0log.h
@@ -0,0 +1,752 @@
+/******************************************************
+Database log
+
+(c) 1995 Innobase Oy
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0log_h
+#define log0log_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "sync0sync.h"
+#include "sync0rw.h"
+
+typedef struct log_struct log_t;
+typedef struct log_group_struct log_group_t;
+
+extern ibool log_do_write;
+extern ibool log_debug_writes;
+
+/* Wait modes for log_flush_up_to */
+#define LOG_NO_WAIT 91
+#define LOG_WAIT_ONE_GROUP 92
+#define LOG_WAIT_ALL_GROUPS 93
+#define LOG_MAX_N_GROUPS 32
+
+/****************************************************************
+Writes to the log the string given. The log must be released with
+log_release. */
+UNIV_INLINE
+dulint
+log_reserve_and_write_fast(
+/*=======================*/
+ /* out: end lsn of the log record, ut_dulint_zero if
+ did not succeed */
+ byte* str, /* in: string */
+ ulint len, /* in: string length */
+ dulint* start_lsn,/* out: start lsn of the log record */
+ ibool* success);/* out: TRUE if success */
+/***************************************************************************
+Releases the log mutex. */
+UNIV_INLINE
+void
+log_release(void);
+/*=============*/
+/***************************************************************************
+Checks if there is need for a log buffer flush or a new checkpoint, and does
+this if yes. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void);
+/*================*/
+/****************************************************************
+Opens the log for log_write_low. The log must be closed with log_close and
+released with log_release. */
+
+dulint
+log_reserve_and_open(
+/*=================*/
+ /* out: start lsn of the log record */
+ ulint len); /* in: length of data to be catenated */
+/****************************************************************
+Writes to the log the string given. It is assumed that the caller holds the
+log mutex. */
+
+void
+log_write_low(
+/*==========*/
+ byte* str, /* in: string */
+ ulint str_len); /* in: string length */
+/****************************************************************
+Closes the log. */
+
+dulint
+log_close(void);
+/*===========*/
+ /* out: lsn */
+/****************************************************************
+Gets the current lsn. */
+UNIV_INLINE
+dulint
+log_get_lsn(void);
+/*=============*/
+ /* out: current lsn */
+/****************************************************************************
+Gets the online backup lsn. */
+UNIV_INLINE
+dulint
+log_get_online_backup_lsn_low(void);
+/*===============================*/
+/****************************************************************************
+Gets the online backup state. */
+UNIV_INLINE
+ibool
+log_get_online_backup_state_low(void);
+/*=================================*/
+ /* out: online backup state, the caller must
+ own the log_sys mutex */
+/**********************************************************
+Initializes the log. */
+
+void
+log_init(void);
+/*==========*/
+/**********************************************************************
+Initializes a log group in the log system. */
+
+void
+log_group_init(
+/*===========*/
+ ulint id, /* in: group id */
+ ulint n_files, /* in: number of log files */
+ ulint file_size, /* in: log file size in bytes */
+ ulint space_id, /* in: space id of the file space
+ which contains the log files of this
+ group */
+ ulint archive_space_id); /* in: space id of the file space
+ which contains some archived log
+ files for this group; currently, only
+ for the first log group this is
+ used */
+/**********************************************************
+Completes an i/o to a log file. */
+
+void
+log_io_complete(
+/*============*/
+ log_group_t* group); /* in: log group */
+/**********************************************************
+This function is called, e.g., when a transaction wants to commit. It checks
+that the log has been flushed to disk up to the last log entry written by the
+transaction. If there is a flush running, it waits and checks if the flush
+flushed enough. If not, starts a new flush. */
+
+void
+log_flush_up_to(
+/*============*/
+ dulint lsn, /* in: log sequence number up to which the log should
+ be flushed, ut_dulint_max if not specified */
+ ulint wait); /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ or LOG_WAIT_ALL_GROUPS */
+/********************************************************************
+Advances the smallest lsn for which there are unflushed dirty blocks in the
+buffer pool and also may make a new checkpoint. NOTE: this function may only
+be called if the calling thread owns no synchronization objects! */
+
+ibool
+log_preflush_pool_modified_pages(
+/*=============================*/
+ /* out: FALSE if there was a flush batch of
+ the same type running, which means that we
+ could not start this flush batch */
+ dulint new_oldest, /* in: try to advance oldest_modified_lsn
+ at least to this lsn */
+ ibool sync); /* in: TRUE if synchronous operation is
+ desired */
+/**********************************************************
+Makes a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks the lsn of the oldest
+modification in the pool, and writes information about that lsn to the
+log files. Use log_make_checkpoint_at to also flush the pool. */
+
+ibool
+log_checkpoint(
+/*===========*/
+ /* out: TRUE if success, FALSE if a checkpoint
+ write was already running */
+ ibool sync, /* in: TRUE if synchronous operation is
+ desired */
+ ibool write_always); /* in: the function normally checks if the
+				new checkpoint would have a greater
+ lsn than the previous one: if not, then no
+ physical write is done; by setting this
+ parameter TRUE, a physical write will always be
+ made to log files */
+/********************************************************************
+Makes a checkpoint at a given lsn or later. */
+
+void
+log_make_checkpoint_at(
+/*===================*/
+ dulint lsn, /* in: make a checkpoint at this or a later
+ lsn, if ut_dulint_max, makes a checkpoint at
+ the latest lsn */
+ ibool write_always); /* in: the function normally checks if the
+				new checkpoint would have a greater
+ lsn than the previous one: if not, then no
+ physical write is done; by setting this
+ parameter TRUE, a physical write will always be
+ made to log files */
+/********************************************************************
+Makes a checkpoint at the latest lsn and writes it to the first page of each
+data file in the database, so that we know that the file spaces contain
+all modifications up to that lsn. This can only be called at database
+shutdown. This function also writes all log in log files to the log archive. */
+
+void
+logs_empty_and_mark_files_at_shutdown(void);
+/*=======================================*/
+/**********************************************************
+Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. */
+
+void
+log_group_read_checkpoint_info(
+/*===========================*/
+ log_group_t* group, /* in: log group */
+ ulint field); /* in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
+/***********************************************************************
+Gets info from a checkpoint about a log group. */
+
+void
+log_checkpoint_get_nth_group_info(
+/*==============================*/
+ byte* buf, /* in: buffer containing checkpoint info */
+ ulint n, /* in: nth slot */
+ ulint* file_no,/* out: archived file number */
+ ulint* offset);/* out: archived file offset */
+/**********************************************************
+Writes checkpoint info to groups. */
+
+void
+log_groups_write_checkpoint_info(void);
+/*==================================*/
+/************************************************************************
+Starts an archiving operation. */
+
+ibool
+log_archive_do(
+/*===========*/
+ /* out: TRUE if succeed, FALSE if an archiving
+ operation was already running */
+ ibool sync, /* in: TRUE if synchronous operation is desired */
+ ulint* n_bytes);/* out: archive log buffer size, 0 if nothing to
+ archive */
+/********************************************************************
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from a number one higher, so that the archiving will
+not write again to the archived log files which exist when this function
+returns. */
+
+ulint
+log_archive_stop(void);
+/*==================*/
+ /* out: DB_SUCCESS or DB_ERROR */
+/********************************************************************
+Starts again archiving which has been stopped. */
+
+ulint
+log_archive_start(void);
+/*===================*/
+ /* out: DB_SUCCESS or DB_ERROR */
+/********************************************************************
+Stop archiving the log so that a gap may occur in the archived log files. */
+
+ulint
+log_archive_noarchivelog(void);
+/*==========================*/
+ /* out: DB_SUCCESS or DB_ERROR */
+/********************************************************************
+Starts archiving the log again after it has been stopped. */
+
+ulint
+log_archive_archivelog(void);
+/*========================*/
+ /* out: DB_SUCCESS or DB_ERROR */
+/**********************************************************
+Generates an archived log file name. */
+
+void
+log_archived_file_name_gen(
+/*=======================*/
+ char* buf, /* in: buffer where to write */
+ ulint id, /* in: group id */
+ ulint file_no);/* in: file number */
+/**********************************************************
+Switches the database to the online backup state. */
+
+ulint
+log_switch_backup_state_on(void);
+/*============================*/
+ /* out: DB_SUCCESS or DB_ERROR */
+/**********************************************************
+Switches the online backup state off. */
+
+ulint
+log_switch_backup_state_off(void);
+/*=============================*/
+ /* out: DB_SUCCESS or DB_ERROR */
+/************************************************************************
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+
+void
+log_check_margins(void);
+/*===================*/
+/**********************************************************
+Reads a specified log segment to a buffer. */
+
+void
+log_group_read_log_seg(
+/*===================*/
+ ulint type, /* in: LOG_ARCHIVE or LOG_RECOVER */
+ byte* buf, /* in: buffer where to read */
+ log_group_t* group, /* in: log group */
+ dulint start_lsn, /* in: read area start */
+ dulint end_lsn); /* in: read area end */
+/**********************************************************
+Writes a buffer to a log file group. */
+
+void
+log_group_write_buf(
+/*================*/
+ ulint type, /* in: LOG_FLUSH or LOG_RECOVER */
+ log_group_t* group, /* in: log group */
+ byte* buf, /* in: buffer */
+ ulint len, /* in: buffer len; must be divisible
+ by OS_FILE_LOG_BLOCK_SIZE */
+ dulint start_lsn, /* in: start lsn of the buffer; must
+ be divisible by
+ OS_FILE_LOG_BLOCK_SIZE */
+ ulint new_data_offset);/* in: start offset of new data in
+ buf: this parameter is used to decide
+ if we have to write a new log file
+ header */
+/************************************************************
+Sets the field values in group to correspond to a given lsn. For this function
+to work, the values must already be correctly initialized to correspond to
+some lsn, for instance, a checkpoint lsn. */
+
+void
+log_group_set_fields(
+/*=================*/
+ log_group_t* group, /* in: group */
+ dulint lsn); /* in: lsn for which the values should be
+ set */
+/**********************************************************
+Calculates the data capacity of a log group, when the log file headers are not
+included. */
+
+ulint
+log_group_get_capacity(
+/*===================*/
+ /* out: capacity in bytes */
+ log_group_t* group); /* in: log group */
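
If, as the description suggests, the capacity is simply the total size of the group's files minus their headers, the arithmetic looks roughly like the sketch below; OS_FILE_LOG_BLOCK_SIZE is assumed to be 512 bytes here, and the real computation lives in log0log.c, not in this header.

#include <stdio.h>

#define OS_FILE_LOG_BLOCK_SIZE	512UL	/* assumed block size */
#define LOG_FILE_HDR_SIZE	(4 * OS_FILE_LOG_BLOCK_SIZE)

int main(void)
{
	unsigned long n_files   = 2;			/* two log files in the group */
	unsigned long file_size = 5UL * 1024 * 1024;	/* 5 MB each */

	unsigned long capacity = n_files * (file_size - LOG_FILE_HDR_SIZE);

	printf("usable log capacity: %lu bytes\n", capacity);

	return(0);
}
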
+/****************************************************************
+Gets a log block flush bit. */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+ /* out: TRUE if this block was the first
+ to be written in a log flush */
+ byte* log_block); /* in: log block */
+/****************************************************************
+Gets a log block number stored in the header. */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+ /* out: log block number stored in the block
+ header */
+ byte* log_block); /* in: log block */
+/****************************************************************
+Gets a log block data length. */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+ /* out: log block data length measured as a
+ byte offset from the block start */
+ byte* log_block); /* in: log block */
+/****************************************************************
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+ byte* log_block, /* in: log block */
+ ulint len); /* in: data length */
+/****************************************************************
+Gets a log block number stored in the trailer. */
+UNIV_INLINE
+ulint
+log_block_get_trl_no(
+/*=================*/
+ /* out: log block number stored in the block
+ trailer */
+ byte* log_block); /* in: log block */
+/****************************************************************
+Gets a log block first mtr log record group offset. */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+ /* out: first mtr log record group byte offset
+ from the block start, 0 if none */
+ byte* log_block); /* in: log block */
+/****************************************************************
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+ byte* log_block, /* in: log block */
+ ulint offset); /* in: offset, 0 if none */
+/****************************************************************
+Gets a log block checkpoint number field (4 lowest bytes). */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+ /* out: checkpoint no (4 lowest bytes) */
+ byte* log_block); /* in: log block */
+/****************************************************************
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+ byte* log_block, /* in: pointer to the log buffer */
+ dulint lsn); /* in: lsn within the log block */
+/****************************************************************
+Converts a lsn to a log block number. */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+ /* out: log block number, it is > 0 and <= 1G */
+ dulint lsn); /* in: lsn of a byte within the block */
+/**********************************************************
+Prints info of the log. */
+
+void
+log_print(void);
+/*===========*/
+
+extern log_t* log_sys;
+
+/* Values used as flags */
+#define LOG_FLUSH 7652559
+#define LOG_CHECKPOINT 78656949
+#define LOG_ARCHIVE 11122331
+#define LOG_RECOVER 98887331
+
+/* The counting of lsn's starts from this value: this must be non-zero */
+#define LOG_START_LSN ut_dulint_create(0, 16 * OS_FILE_LOG_BLOCK_SIZE)
+
+#define LOG_BUFFER_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE)
+#define LOG_ARCHIVE_BUF_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE / 4)
+
+/* Offsets of a log block header */
+#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and
+ is allowed to wrap around at 2G; the
+ highest bit is set to 1 if this is the
+ first log block in a log flush write
+ segment */
+#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000
+ /* mask used to get the highest bit in
+ the preceding field */
+#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to
+ this block */
+#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an
+ mtr log record group in this log block,
+ 0 if none; if the value is the same
+ as LOG_BLOCK_HDR_DATA_LEN, it means
+ that the first rec group has not yet
+ been catenated to this log block, but
+					when it is, it will start at this
+ offset; an archive recovery can
+ start parsing the log records starting
+ from this offset in this log block,
+ if value not 0 */
+#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of
+ log_sys->next_checkpoint_no when the
+ log block was last written to: if the
+ block has not yet been written full,
+ this value is only updated before a
+ log buffer flush */
+#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in
+ bytes */
+
+/* Offsets of a log block trailer from the end of the block */
+#define LOG_BLOCK_TRL_NO 4 /* log block number */
+#define LOG_BLOCK_TRL_SIZE 4 /* trailer size in bytes */
+
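
Taken together, the offsets above imply that each log block carries a 12-byte header and a 4-byte trailer around its payload. A tiny sketch of that arithmetic, with OS_FILE_LOG_BLOCK_SIZE assumed to be 512 bytes:

#include <stdio.h>

#define OS_FILE_LOG_BLOCK_SIZE	512	/* assumed */
#define LOG_BLOCK_HDR_SIZE	12	/* from the header offsets above */
#define LOG_BLOCK_TRL_SIZE	4	/* from the trailer offsets above */

int main(void)
{
	int payload = OS_FILE_LOG_BLOCK_SIZE
			- LOG_BLOCK_HDR_SIZE - LOG_BLOCK_TRL_SIZE;

	/* prints 496: the number of log bytes that fit into one block */
	printf("payload per log block: %d bytes\n", payload);

	return(0);
}
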
+/* Offsets for a checkpoint field */
+#define LOG_CHECKPOINT_NO 0
+#define LOG_CHECKPOINT_LSN 8
+#define LOG_CHECKPOINT_OFFSET 16
+#define LOG_CHECKPOINT_LOG_BUF_SIZE 20
+#define LOG_CHECKPOINT_ARCHIVED_LSN 24
+#define LOG_CHECKPOINT_GROUP_ARRAY 32
+
+/* For each value < LOG_MAX_N_GROUPS the following 8 bytes: */
+
+#define LOG_CHECKPOINT_ARCHIVED_FILE_NO 0
+#define LOG_CHECKPOINT_ARCHIVED_OFFSET 4
+
+#define LOG_CHECKPOINT_ARRAY_END (LOG_CHECKPOINT_GROUP_ARRAY\
+ + LOG_MAX_N_GROUPS * 8)
+#define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END
+#define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END)
+#define LOG_CHECKPOINT_SIZE (8 + LOG_CHECKPOINT_ARRAY_END)
+
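
Since LOG_MAX_N_GROUPS is defined as 32 earlier in this header, the group array holds 32 eight-byte slots starting at byte 32, which fixes the remaining checkpoint offsets. Spelling the arithmetic out:

#include <stdio.h>

#define LOG_MAX_N_GROUPS		32
#define LOG_CHECKPOINT_GROUP_ARRAY	32
#define LOG_CHECKPOINT_ARRAY_END	(LOG_CHECKPOINT_GROUP_ARRAY \
					+ LOG_MAX_N_GROUPS * 8)

int main(void)
{
	/* prints 288, 288, 292 and 296 */
	printf("array end  = %d\n", LOG_CHECKPOINT_ARRAY_END);
	printf("checksum 1 = %d\n", LOG_CHECKPOINT_ARRAY_END);
	printf("checksum 2 = %d\n", 4 + LOG_CHECKPOINT_ARRAY_END);
	printf("total size = %d\n", 8 + LOG_CHECKPOINT_ARRAY_END);

	return(0);
}
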
+/* Offsets of a log file header */
+#define LOG_GROUP_ID 0 /* log group number */
+#define LOG_FILE_START_LSN 4 /* lsn of the start of data in this
+ log file */
+#define LOG_FILE_NO 12 /* 4-byte archived log file number */
+#define LOG_FILE_ARCH_COMPLETED OS_FILE_LOG_BLOCK_SIZE
+ /* this 4-byte field is TRUE when
+ the writing of an archived log file
+ has been completed */
+#define LOG_FILE_END_LSN (OS_FILE_LOG_BLOCK_SIZE + 4)
+ /* lsn where the archived log file
+ at least extends: actually the
+ archived log file may extend to a
+ later lsn, as long as it is within the
+ same log block as this lsn; this field
+ is defined only when an archived log
+ file has been completely written */
+#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE
+#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE)
+#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE)
+
+#define LOG_GROUP_OK 301
+#define LOG_GROUP_CORRUPTED 302
+
+/* Log group consists of a number of log files, each of the same size; a log
+group is implemented as a space in the sense of the module fil0fil. */
+
+struct log_group_struct{
+ /* The following fields are protected by log_sys->mutex */
+ ulint id; /* log group id */
+ ulint n_files; /* number of files in the group */
+ ulint file_size; /* individual log file size in bytes,
+ including the log file header */
+ ulint space_id; /* file space which implements the log
+ group */
+ ulint state; /* LOG_GROUP_OK or
+ LOG_GROUP_CORRUPTED */
+ dulint lsn; /* lsn used to fix coordinates within
+ the log group */
+ ulint lsn_offset; /* the offset of the above lsn */
+ ulint n_pending_writes;/* number of currently pending flush
+ writes for this log group */
+ byte** file_header_bufs;/* buffers for each file header in the
+ group */
+ /*-----------------------------*/
+ byte** archive_file_header_bufs;/* buffers for each file
+ header in the group */
+ ulint archive_space_id;/* file space which implements the log
+ group archive */
+ ulint archived_file_no;/* file number corresponding to
+ log_sys->archived_lsn */
+ ulint archived_offset;/* file offset corresponding to
+ log_sys->archived_lsn, 0 if we have
+ not yet written to the archive file
+ number archived_file_no */
+ ulint next_archived_file_no;/* during an archive write,
+ until the write is completed, we
+ store the next value for
+ archived_file_no here: the write
+ completion function then sets the new
+ value to ..._file_no */
+ ulint next_archived_offset; /* like the preceding field */
+ /*-----------------------------*/
+ dulint scanned_lsn; /* used only in recovery: recovery scan
+ succeeded up to this lsn in this log
+ group */
+ byte* checkpoint_buf; /* checkpoint header is written from
+ this buffer to the group */
+ UT_LIST_NODE_T(log_group_t)
+ log_groups; /* list of log groups */
+};
+
+struct log_struct{
+ byte pad[64]; /* padding to prevent other memory
+ update hotspots from residing on the
+ same memory cache line */
+ dulint lsn; /* log sequence number */
+ ulint buf_free; /* first free offset within the log
+ buffer */
+ mutex_t mutex; /* mutex protecting the log */
+ byte* buf; /* log buffer */
+ ulint buf_size; /* log buffer size in bytes */
+ ulint max_buf_free; /* recommended maximum value of
+ buf_free, after which the buffer is
+ flushed */
+ ulint old_buf_free; /* value of buf free when log was
+ last time opened; only in the debug
+ version */
+ dulint old_lsn; /* value of lsn when log was last time
+ opened; only in the debug version */
+ ibool check_flush_or_checkpoint;
+ /* this is set to TRUE when there may
+ be need to flush the log buffer, or
+ preflush buffer pool pages, or make
+ a checkpoint; this MUST be TRUE when
+ lsn - last_checkpoint_lsn >
+ max_checkpoint_age; this flag is
+ peeked at by log_free_check(), which
+ does not reserve the log mutex */
+ UT_LIST_BASE_NODE_T(log_group_t)
+ log_groups; /* log groups */
+
+ /* The fields involved in the log buffer flush */
+
+ ulint buf_next_to_write;/* first offset in the log buffer
+					where the byte content may not yet have
+					been written to file, e.g., the start
+ offset of a log record catenated
+ later; this is advanced when a flush
+ operation is completed to all the log
+ groups */
+ dulint written_to_some_lsn;
+ /* first log sequence number not yet
+ written to any log group; for this to
+ be advanced, it is enough that the
+ write i/o has been completed for any
+ one log group */
+ dulint written_to_all_lsn;
+ /* first log sequence number not yet
+ written to some log group; for this to
+ be advanced, it is enough that the
+ write i/o has been completed for all
+ log groups */
+ dulint flush_lsn; /* end lsn for the current flush */
+	ulint		flush_end_offset;/* the data in buffer has been flushed
+ up to this offset when the current
+ flush ends: this field will then
+ be copied to buf_next_to_write */
+ ulint n_pending_writes;/* number of currently pending flush
+ writes */
+ os_event_t no_flush_event; /* this event is in the reset state
+ when a flush is running; a thread
+ should wait for this without owning
+ the log mutex, but NOTE that to set or
+ reset this event, the thread MUST own
+ the log mutex! */
+ ibool one_flushed; /* during a flush, this is first FALSE
+ and becomes TRUE when one log group
+ has been flushed */
+ os_event_t one_flushed_event;/* this event is reset when the
+ flush has not yet completed for any
+ log group; e.g., this means that a
+ transaction has been committed when
+ this is set; a thread should wait
+ for this without owning the log mutex,
+ but NOTE that to set or reset this
+ event, the thread MUST own the log
+ mutex! */
+ ulint n_log_ios; /* number of log i/os initiated thus
+ far */
+ /* Fields involved in checkpoints */
+ ulint max_modified_age_async;
+ /* when this recommended value for lsn
+ - buf_pool_get_oldest_modification()
+ is exceeded, we start an asynchronous
+ preflush of pool pages */
+ ulint max_modified_age_sync;
+ /* when this recommended value for lsn
+ - buf_pool_get_oldest_modification()
+ is exceeded, we start a synchronous
+ preflush of pool pages */
+ ulint adm_checkpoint_interval;
+ /* administrator-specified checkpoint
+ interval in terms of log growth in
+ bytes; the interval actually used by
+ the database can be smaller */
+ ulint max_checkpoint_age_async;
+ /* when this checkpoint age is exceeded
+ we start an asynchronous writing of a
+ new checkpoint */
+ ulint max_checkpoint_age;
+ /* this is the maximum allowed value
+ for lsn - last_checkpoint_lsn when a
+ new query step is started */
+ dulint next_checkpoint_no;
+ /* next checkpoint number */
+ dulint last_checkpoint_lsn;
+ /* latest checkpoint lsn */
+ dulint next_checkpoint_lsn;
+ /* next checkpoint lsn */
+ ulint n_pending_checkpoint_writes;
+ /* number of currently pending
+ checkpoint writes */
+ rw_lock_t checkpoint_lock;/* this latch is x-locked when a
+ checkpoint write is running; a thread
+ should wait for this without owning
+ the log mutex */
+ byte* checkpoint_buf; /* checkpoint header is read to this
+ buffer */
+ /* Fields involved in archiving */
+	ulint		archiving_state;/* LOG_ARCH_ON, LOG_ARCH_STOPPING,
+ LOG_ARCH_STOPPED, LOG_ARCH_OFF */
+ dulint archived_lsn; /* archiving has advanced to this lsn */
+ ulint max_archived_lsn_age_async;
+ /* recommended maximum age of
+ archived_lsn, before we start
+ asynchronous copying to the archive */
+ ulint max_archived_lsn_age;
+ /* maximum allowed age for
+ archived_lsn */
+ dulint next_archived_lsn;/* during an archive write,
+ until the write is completed, we
+ store the next value for
+ archived_lsn here: the write
+ completion function then sets the new
+ value to archived_lsn */
+ ulint archiving_phase;/* LOG_ARCHIVE_READ or
+ LOG_ARCHIVE_WRITE */
+ ulint n_pending_archive_ios;
+ /* number of currently pending reads
+ or writes in archiving */
+ rw_lock_t archive_lock; /* this latch is x-locked when an
+ archive write is running; a thread
+ should wait for this without owning
+ the log mutex */
+ ulint archive_buf_size;/* size of archive_buf */
+ byte* archive_buf; /* log segment is written to the
+ archive from this buffer */
+ os_event_t archiving_on; /* if archiving has been stopped,
+ a thread can wait for this event to
+ become signaled */
+ /* Fields involved in online backups */
+ ibool online_backup_state;
+ /* TRUE if the database is in the
+ online backup state */
+ dulint online_backup_lsn;
+ /* lsn when the state was changed to
+ the online backup state */
+};
+
+#define LOG_ARCH_ON 71
+#define LOG_ARCH_STOPPING 72
+#define LOG_ARCH_STOPPING2 73
+#define LOG_ARCH_STOPPED 74
+#define LOG_ARCH_OFF 75
+
+#ifndef UNIV_NONINL
+#include "log0log.ic"
+#endif
+
+#endif
diff --git a/innobase/include/log0log.ic b/innobase/include/log0log.ic
new file mode 100644
index 00000000000..e5c313d129b
--- /dev/null
+++ b/innobase/include/log0log.ic
@@ -0,0 +1,378 @@
+/******************************************************
+Database log
+
+(c) 1995 Innobase Oy
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+#include "mach0data.h"
+#include "mtr0mtr.h"
+
+/**********************************************************
+Checks by parsing that the catenated log segment for a single mtr is
+consistent. */
+
+ibool
+log_check_log_recs(
+/*===============*/
+ byte* buf, /* in: pointer to the start of the log segment
+ in the log_sys->buf log buffer */
+ ulint len, /* in: segment length in bytes */
+ dulint buf_start_lsn); /* in: buffer start lsn */
+
+/****************************************************************
+Gets a log block flush bit. */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+ /* out: TRUE if this block was the first
+ to be written in a log flush */
+ byte* log_block) /* in: log block */
+{
+ if (LOG_BLOCK_FLUSH_BIT_MASK
+ & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************
+Sets the log block flush bit. */
+UNIV_INLINE
+void
+log_block_set_flush_bit(
+/*====================*/
+ byte* log_block, /* in: log block */
+ ibool val) /* in: value to set */
+{
+ ulint field;
+
+ field = mach_read_from_4(log_block + LOG_BLOCK_HDR_NO);
+
+ if (val) {
+ field = field | LOG_BLOCK_FLUSH_BIT_MASK;
+ } else {
+ field = field & ~LOG_BLOCK_FLUSH_BIT_MASK;
+ }
+
+ mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, field);
+}
+
+/****************************************************************
+Gets a log block number stored in the header. */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+ /* out: log block number stored in the block
+ header */
+ byte* log_block) /* in: log block */
+{
+ return(~LOG_BLOCK_FLUSH_BIT_MASK
+ & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO));
+}
+
+/****************************************************************
+Sets the log block number stored in the header; NOTE that this must be set
+before the flush bit! */
+UNIV_INLINE
+void
+log_block_set_hdr_no(
+/*=================*/
+ byte* log_block, /* in: log block */
+ ulint n) /* in: log block number: must be > 0 and
+ < LOG_BLOCK_FLUSH_BIT_MASK */
+{
+ ut_ad(n > 0);
+ ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK);
+
+ mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, n);
+}
+
+/****************************************************************
+Gets a log block data length. */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+ /* out: log block data length measured as a
+ byte offset from the block start */
+ byte* log_block) /* in: log block */
+{
+ return(mach_read_from_2(log_block + LOG_BLOCK_HDR_DATA_LEN));
+}
+
+/****************************************************************
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+ byte* log_block, /* in: log block */
+ ulint len) /* in: data length */
+{
+ mach_write_to_2(log_block + LOG_BLOCK_HDR_DATA_LEN, len);
+}
+
+/****************************************************************
+Gets a log block first mtr log record group offset. */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+ /* out: first mtr log record group byte offset
+ from the block start, 0 if none */
+ byte* log_block) /* in: log block */
+{
+ return(mach_read_from_2(log_block + LOG_BLOCK_FIRST_REC_GROUP));
+}
+
+/****************************************************************
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+ byte* log_block, /* in: log block */
+ ulint offset) /* in: offset, 0 if none */
+{
+ mach_write_to_2(log_block + LOG_BLOCK_FIRST_REC_GROUP, offset);
+}
+
+/****************************************************************
+Gets a log block checkpoint number field (4 lowest bytes). */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+ /* out: checkpoint no (4 lowest bytes) */
+ byte* log_block) /* in: log block */
+{
+ return(mach_read_from_4(log_block + LOG_BLOCK_CHECKPOINT_NO));
+}
+
+/****************************************************************
+Sets a log block checkpoint number field (4 lowest bytes). */
+UNIV_INLINE
+void
+log_block_set_checkpoint_no(
+/*========================*/
+ byte* log_block, /* in: log block */
+ dulint no) /* in: checkpoint no */
+{
+ mach_write_to_4(log_block + LOG_BLOCK_CHECKPOINT_NO,
+ ut_dulint_get_low(no));
+}
+
+/****************************************************************
+Gets a log block number stored in the trailer. */
+UNIV_INLINE
+ulint
+log_block_get_trl_no(
+/*=================*/
+ /* out: log block number stored in the block
+ trailer */
+ byte* log_block) /* in: log block */
+{
+ return(mach_read_from_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+ - LOG_BLOCK_TRL_NO));
+}
+
+/****************************************************************
+Sets the log block number stored in the trailer. */
+UNIV_INLINE
+void
+log_block_set_trl_no(
+/*=================*/
+ byte* log_block, /* in: log block */
+ ulint n) /* in: log block number */
+{
+ mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_NO,
+ n);
+}
+
+/****************************************************************
+Converts a lsn to a log block number. */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+ /* out: log block number, it is > 0 and <= 1G */
+ dulint lsn) /* in: lsn of a byte within the block */
+{
+ ulint no;
+
+ no = ut_dulint_get_low(lsn) / OS_FILE_LOG_BLOCK_SIZE;
+ no += (ut_dulint_get_high(lsn) % OS_FILE_LOG_BLOCK_SIZE)
+ * 2 * (0x80000000 / OS_FILE_LOG_BLOCK_SIZE);
+
+ no = no & 0x3FFFFFFF;
+
+ return(no + 1);
+}
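
The conversion above packs an lsn into a block number that wraps around at 2^30 and is never zero. A standalone sketch of the same computation, with the dulint represented as two 32-bit halves and OS_FILE_LOG_BLOCK_SIZE assumed to be 512:

#include <stdio.h>

#define OS_FILE_LOG_BLOCK_SIZE	512UL	/* assumed */

static unsigned long lsn_to_block_no(unsigned long high, unsigned long low)
{
	unsigned long no;

	no = low / OS_FILE_LOG_BLOCK_SIZE;
	no += (high % OS_FILE_LOG_BLOCK_SIZE)
		* 2 * (0x80000000UL / OS_FILE_LOG_BLOCK_SIZE);

	/* block numbers wrap around at 2^30 and are always >= 1 */
	return((no & 0x3FFFFFFFUL) + 1);
}

int main(void)
{
	/* a byte 81920 bytes into the log lies in block number 161 */
	printf("%lu\n", lsn_to_block_no(0, 81920));

	return(0);
}
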
+
+/****************************************************************
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+ byte* log_block, /* in: pointer to the log buffer */
+ dulint lsn) /* in: lsn within the log block */
+{
+ ulint no;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ no = log_block_convert_lsn_to_no(lsn);
+
+ log_block_set_hdr_no(log_block, no);
+ log_block_set_trl_no(log_block, no);
+
+ log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+ log_block_set_first_rec_group(log_block, 0);
+}
+
+/****************************************************************
+Writes to the log the string given. The log must be released with
+log_release. */
+UNIV_INLINE
+dulint
+log_reserve_and_write_fast(
+/*=======================*/
+ /* out: end lsn of the log record, ut_dulint_zero if
+ did not succeed */
+ byte* str, /* in: string */
+ ulint len, /* in: string length */
+ dulint* start_lsn,/* out: start lsn of the log record */
+ ibool* success)/* out: TRUE if success */
+{
+ log_t* log = log_sys;
+ ulint data_len;
+ dulint lsn;
+
+ *success = TRUE;
+
+ mutex_enter(&(log->mutex));
+
+ data_len = len + log->buf_free % OS_FILE_LOG_BLOCK_SIZE;
+
+ if (log->online_backup_state
+ || (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE)) {
+
+		/* The string does not fit within the current log block,
+		the block would become full, or the database is in the
+		online backup state */
+
+ *success = FALSE;
+
+ mutex_exit(&(log->mutex));
+
+ return(ut_dulint_zero);
+ }
+
+ *start_lsn = log->lsn;
+
+ ut_memcpy(log->buf + log->buf_free, str, len);
+
+ log_block_set_data_len(ut_align_down(log->buf + log->buf_free,
+ OS_FILE_LOG_BLOCK_SIZE),
+ data_len);
+#ifdef UNIV_LOG_DEBUG
+ log->old_buf_free = log->buf_free;
+ log->old_lsn = log->lsn;
+#endif
+ log->buf_free += len;
+
+ ut_ad(log->buf_free <= log->buf_size);
+
+ lsn = ut_dulint_add(log->lsn, len);
+
+ log->lsn = lsn;
+
+#ifdef UNIV_LOG_DEBUG
+ log_check_log_recs(log->buf + log->old_buf_free,
+ log->buf_free - log->old_buf_free, log->old_lsn);
+#endif
+ return(lsn);
+}
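
A hypothetical caller sketch for the fast path above (not part of the patch; the helper name write_small_rec is made up). It assumes log0log.h is included and the log system has been initialized; note that on success the log mutex is still owned, so the caller must release it with log_release().

dulint
write_small_rec(
/*============*/
			/* out: end lsn of the record, ut_dulint_zero if
			the fast path could not be used */
	byte*	str,	/* in: log record string */
	ulint	len)	/* in: string length */
{
	dulint	start_lsn;
	dulint	end_lsn;
	ibool	success;

	end_lsn = log_reserve_and_write_fast(str, len, &start_lsn, &success);

	if (success) {
		/* the record fitted in the current log block; the log
		mutex is still owned and must be released here */

		log_release();

		return(end_lsn);
	}

	/* a real caller would fall back to the general, slower log
	write path at this point */

	return(ut_dulint_zero);
}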
+
+/***************************************************************************
+Releases the log mutex. */
+UNIV_INLINE
+void
+log_release(void)
+/*=============*/
+{
+ mutex_exit(&(log_sys->mutex));
+}
+
+/****************************************************************
+Gets the current lsn. */
+UNIV_INLINE
+dulint
+log_get_lsn(void)
+/*=============*/
+ /* out: current lsn */
+{
+ dulint lsn;
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(lsn);
+}
+
+/***************************************************************************
+Checks if there is need for a log buffer flush or a new checkpoint, and does
+this if yes. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void)
+/*================*/
+{
+ /* ut_ad(sync_thread_levels_empty()); */
+
+ if (log_sys->check_flush_or_checkpoint) {
+
+ log_check_margins();
+ }
+}
+
+/****************************************************************************
+Gets the online backup lsn. */
+UNIV_INLINE
+dulint
+log_get_online_backup_lsn_low(void)
+/*===============================*/
+ /* out: online_backup_lsn, the caller must
+ own the log_sys mutex */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_ad(log_sys->online_backup_state);
+
+ return(log_sys->online_backup_lsn);
+}
+
+/****************************************************************************
+Gets the online backup state. */
+UNIV_INLINE
+ibool
+log_get_online_backup_state_low(void)
+/*=================================*/
+ /* out: online backup state, the caller must
+ own the log_sys mutex */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ return(log_sys->online_backup_state);
+}
diff --git a/innobase/include/log0recv.h b/innobase/include/log0recv.h
new file mode 100644
index 00000000000..51f14393d38
--- /dev/null
+++ b/innobase/include/log0recv.h
@@ -0,0 +1,284 @@
+/******************************************************
+Recovery
+
+(c) 1997 Innobase Oy
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0recv_h
+#define log0recv_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "page0types.h"
+#include "hash0hash.h"
+#include "log0log.h"
+
+/***********************************************************************
+Returns TRUE if recovery is currently running. */
+UNIV_INLINE
+ibool
+recv_recovery_is_on(void);
+/*=====================*/
+/***********************************************************************
+Returns TRUE if recovery from backup is currently running. */
+UNIV_INLINE
+ibool
+recv_recovery_from_backup_is_on(void);
+/*=================================*/
+/****************************************************************************
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool. */
+
+void
+recv_recover_page(
+/*==============*/
+ ibool just_read_in, /* in: TRUE if the i/o-handler calls this for
+ a freshly read page */
+ page_t* page, /* in: buffer page */
+ ulint space, /* in: space id */
+ ulint page_no); /* in: page number */
+/************************************************************
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it. */
+
+ulint
+recv_recovery_from_checkpoint_start(
+/*================================*/
+ /* out: error code or DB_SUCCESS */
+ ulint type, /* in: LOG_CHECKPOINT or LOG_ARCHIVE */
+ dulint limit_lsn, /* in: recover up to this lsn if possible */
+ dulint min_flushed_lsn,/* in: min flushed lsn from data files */
+ dulint max_flushed_lsn);/* in: max flushed lsn from data files */
+/************************************************************
+Completes recovery from a checkpoint. */
+
+void
+recv_recovery_from_checkpoint_finish(void);
+/*======================================*/
+/***********************************************************
+Scans log from a buffer and stores new log data to the parsing buffer. Parses
+and hashes the log records if new data found. */
+
+ibool
+recv_scan_log_recs(
+/*===============*/
+ /* out: TRUE if limit_lsn has been reached, or
+ not able to scan any more in this log group */
+ ibool store_to_hash, /* in: TRUE if the records should be stored
+ to the hash table; this is set FALSE if just
+ debug checking is needed */
+ byte* buf, /* in: buffer containing a log segment or
+ garbage */
+ ulint len, /* in: buffer length */
+ dulint start_lsn, /* in: buffer start lsn */
+ dulint* contiguous_lsn, /* in/out: it is known that all log groups
+ contain contiguous log data up to this lsn */
+ dulint* group_scanned_lsn);/* out: scanning succeeded up to this lsn */
+/**********************************************************
+Resets the logs. The contents of log files will be lost! */
+
+void
+recv_reset_logs(
+/*============*/
+ dulint lsn, /* in: reset to this lsn rounded up to
+ be divisible by OS_FILE_LOG_BLOCK_SIZE,
+ after which we add LOG_BLOCK_HDR_SIZE */
+ ulint arch_log_no, /* in: next archived log file number */
+ ibool new_logs_created);/* in: TRUE if resetting logs is done
+ at the log creation; FALSE if it is done
+ after archive recovery */
+/************************************************************
+Creates the recovery system. */
+
+void
+recv_sys_create(void);
+/*=================*/
+/************************************************************
+Inits the recovery system for a recovery operation. */
+
+void
+recv_sys_init(void);
+/*===============*/
+/***********************************************************************
+Empties the hash table of stored log records, applying them to appropriate
+pages. */
+
+void
+recv_apply_hashed_log_recs(
+/*=======================*/
+ ibool allow_ibuf); /* in: if TRUE, also ibuf operations are
+ allowed during the application; if FALSE,
+ no ibuf operations are allowed, and after
+ the application all file pages are flushed to
+ disk and invalidated in buffer pool: this
+ alternative means that no new log records
+ can be generated during the application */
+/************************************************************
+Recovers from archived log files, and also from log files, if they exist. */
+
+ulint
+recv_recovery_from_archive_start(
+/*=============================*/
+ /* out: error code or DB_SUCCESS */
+ dulint min_flushed_lsn,/* in: min flushed lsn field from the
+ data files */
+ dulint limit_lsn, /* in: recover up to this lsn if possible */
+ ulint first_log_no); /* in: number of the first archived log file
+ to use in the recovery; the file will be
+ searched from INNOBASE_LOG_ARCH_DIR specified
+ in server config file */
+/************************************************************
+Completes recovery from archive. */
+
+void
+recv_recovery_from_archive_finish(void);
+/*===================================*/
+/***********************************************************************
+Checks that a replica of a space is identical to the original space. */
+
+void
+recv_compare_spaces(
+/*================*/
+ ulint space1, /* in: space id */
+ ulint space2, /* in: space id */
+ ulint n_pages);/* in: number of pages */
+/***********************************************************************
+Checks that a replica of a space is identical to the original space. Disables
+ibuf operations and flushes and invalidates the buffer pool pages after the
+test. This function can be used to check the recovery before dict or trx
+systems are initialized. */
+
+void
+recv_compare_spaces_low(
+/*====================*/
+ ulint space1, /* in: space id */
+ ulint space2, /* in: space id */
+ ulint n_pages);/* in: number of pages */
+
+/* Block of log record data */
+typedef struct recv_data_struct recv_data_t;
+struct recv_data_struct{
+ recv_data_t* next; /* pointer to the next block or NULL */
+ /* the log record data is stored physically
+ immediately after this struct, max amount
+ RECV_DATA_BLOCK_SIZE bytes of it */
+};
+
+/* Stored log record struct */
+typedef struct recv_struct recv_t;
+struct recv_struct{
+ byte type; /* log record type */
+ ulint len; /* log record body length in bytes */
+ recv_data_t* data; /* chain of blocks containing the log record
+ body */
+ dulint start_lsn;/* start lsn of the log segment written by
+ the mtr which generated this log record: NOTE
+ that this is not necessarily the start lsn of
+ this log record */
+ dulint end_lsn;/* end lsn of the log segment written by
+ the mtr which generated this log record: NOTE
+ that this is not necessarily the end lsn of
+ this log record */
+ UT_LIST_NODE_T(recv_t)
+ rec_list;/* list of log records for this page */
+};
+
+/* Hashed page file address struct */
+typedef struct recv_addr_struct recv_addr_t;
+struct recv_addr_struct{
+	ulint		state;	/* RECV_NOT_PROCESSED, RECV_BEING_READ,
+				RECV_BEING_PROCESSED, or RECV_PROCESSED */
+ ulint space; /* space id */
+ ulint page_no;/* page number */
+ UT_LIST_BASE_NODE_T(recv_t)
+ rec_list;/* list of log records for this page */
+ hash_node_t addr_hash;
+};
+
+/* Recovery system data structure */
+typedef struct recv_sys_struct recv_sys_t;
+struct recv_sys_struct{
+ mutex_t mutex; /* mutex protecting the fields apply_log_recs,
+ n_addrs, and the state field in each recv_addr
+ struct */
+ ibool apply_log_recs;
+ /* this is TRUE when log rec application to
+ pages is allowed; this flag tells the
+ i/o-handler if it should do log record
+ application */
+ ibool apply_batch_on;
+ /* this is TRUE when a log rec application
+ batch is running */
+ dulint lsn; /* log sequence number */
+ ulint last_log_buf_size;
+ /* size of the log buffer when the database
+ last time wrote to the log */
+ byte* last_block;
+ /* possible incomplete last recovered log
+ block */
+ byte* last_block_buf_start;
+ /* the nonaligned start address of the
+ preceding buffer */
+ byte* buf; /* buffer for parsing log records */
+ ulint len; /* amount of data in buf */
+ dulint parse_start_lsn;
+ /* this is the lsn from which we were able to
+ start parsing log records and adding them to
+ the hash table; ut_dulint_zero if a suitable
+ start point not found yet */
+ dulint scanned_lsn;
+ /* the log data has been scanned up to this
+ lsn */
+ ulint scanned_checkpoint_no;
+ /* the log data has been scanned up to this
+ checkpoint number (lowest 4 bytes) */
+ ulint recovered_offset;
+ /* start offset of non-parsed log records in
+ buf */
+ dulint recovered_lsn;
+ /* the log records have been parsed up to
+ this lsn */
+ dulint limit_lsn;/* recovery should be made at most up to this
+ lsn */
+ log_group_t* archive_group;
+ /* in archive recovery: the log group whose
+ archive is read */
+ mem_heap_t* heap; /* memory heap of log records and file
+ addresses*/
+ hash_table_t* addr_hash;/* hash table of file addresses of pages */
+ ulint n_addrs;/* number of not processed hashed file
+ addresses in the hash table */
+};
+
+extern recv_sys_t* recv_sys;
+extern ibool recv_recovery_on;
+extern ibool recv_no_ibuf_operations;
+
+/* States of recv_addr_struct */
+#define RECV_NOT_PROCESSED 71
+#define RECV_BEING_READ 72
+#define RECV_BEING_PROCESSED 73
+#define RECV_PROCESSED 74
+
+/* The number which is added to a space id to obtain the replicate space
+in the debug version: spaces with an odd number as the id are replicate
+spaces */
+#define RECV_REPLICA_SPACE_ADD 1
+
+/* This many blocks must be left free in the buffer pool when we scan
+the log and store the scanned log records in the buffer pool: we will
+use these free blocks to read in pages when we start applying the
+log records to the database. */
+
+#define RECV_POOL_N_FREE_BLOCKS (ut_min(256, buf_pool_get_curr_size() / 8))
+
+#ifndef UNIV_NONINL
+#include "log0recv.ic"
+#endif
+
+#endif
diff --git a/innobase/include/log0recv.ic b/innobase/include/log0recv.ic
new file mode 100644
index 00000000000..489641bade2
--- /dev/null
+++ b/innobase/include/log0recv.ic
@@ -0,0 +1,35 @@
+/******************************************************
+Recovery
+
+(c) 1997 Innobase Oy
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#include "mem0mem.h"
+#include "log0log.h"
+#include "os0file.h"
+
+extern ibool recv_recovery_from_backup_on;
+
+/***********************************************************************
+Returns TRUE if recovery is currently running. */
+UNIV_INLINE
+ibool
+recv_recovery_is_on(void)
+/*=====================*/
+{
+ return(recv_recovery_on);
+}
+
+/***********************************************************************
+Returns TRUE if recovery from backup is currently running. */
+UNIV_INLINE
+ibool
+recv_recovery_from_backup_is_on(void)
+/*=================================*/
+{
+ return(recv_recovery_from_backup_on);
+}
+
diff --git a/innobase/include/mach0data.h b/innobase/include/mach0data.h
new file mode 100644
index 00000000000..006f55d5f1f
--- /dev/null
+++ b/innobase/include/mach0data.h
@@ -0,0 +1,332 @@
+/**********************************************************************
+Utilities for converting data from the database file
+to the machine format.
+
+(c) 1995 Innobase Oy
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef mach0data_h
+#define mach0data_h
+
+#include "univ.i"
+#include "ut0byte.h"
+
+/* The data and all fields are always stored in a database file
+in the same format: ASCII, big-endian, and so on.
+All data in the files MUST be accessed using the functions in this
+module. */
+
+/***********************************************************
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /* in: pointer to byte where to store */
+ ulint n); /* in: ulint integer to be stored, >= 0, < 256 */
+/************************************************************
+The following function is used to fetch data from one byte. */
+UNIV_INLINE
+ulint
+mach_read_from_1(
+/*=============*/
+ /* out: ulint integer, >= 0, < 256 */
+ byte* b); /* in: pointer to byte */
+/***********************************************************
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lower address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /* in: pointer to two bytes where to store */
+ ulint n); /* in: ulint integer to be stored, >= 0, < 64k */
+/************************************************************
+The following function is used to fetch data from two consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+ /* out: ulint integer, >= 0, < 64k */
+ byte* b); /* in: pointer to two bytes */
+/***********************************************************
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /* in: pointer to 3 bytes where to store */
+ ulint n); /* in: ulint integer to be stored */
+/************************************************************
+The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+ulint
+mach_read_from_3(
+/*=============*/
+ /* out: ulint integer */
+ byte* b); /* in: pointer to 3 bytes */
+/***********************************************************
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /* in: pointer to four bytes where to store */
+ ulint n); /* in: ulint integer to be stored */
+/************************************************************
+The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+ulint
+mach_read_from_4(
+/*=============*/
+ /* out: ulint integer */
+ byte* b); /* in: pointer to four bytes */
+/***********************************************************
+The following function is used to store data from a ulint to memory
+in standard order:
+we store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write(
+/*=======*/
+ byte* b, /* in: pointer to sizeof(ulint) bytes where to store */
+ ulint n); /* in: ulint integer to be stored */
+/************************************************************
+The following function is used to fetch data from memory to a ulint.
+The most significant byte is at the lowest address. */
+UNIV_INLINE
+ulint
+mach_read(
+/*======*/
+ /* out: ulint integer */
+ byte* b); /* in: pointer to sizeof(ulint) bytes */
+/*************************************************************
+Writes a ulint in a compressed form. */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ /* out: stored size in bytes */
+ byte* b, /* in: pointer to memory where to store */
+ ulint n); /* in: ulint integer to be stored */
+/*************************************************************
+Returns the size of an ulint when written in the compressed form. */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ /* out: compressed size in bytes */
+ ulint n); /* in: ulint integer to be stored */
+/*************************************************************
+Reads a ulint in a compressed form. */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+ /* out: read integer */
+ byte* b); /* in: pointer to memory from where to read */
+/***********************************************************
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /* in: pointer to 6 bytes where to store */
+ dulint n); /* in: dulint integer to be stored */
+/************************************************************
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+dulint
+mach_read_from_6(
+/*=============*/
+ /* out: dulint integer */
+ byte* b); /* in: pointer to 6 bytes */
+/***********************************************************
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /* in: pointer to 7 bytes where to store */
+ dulint n); /* in: dulint integer to be stored */
+/************************************************************
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+dulint
+mach_read_from_7(
+/*=============*/
+ /* out: dulint integer */
+ byte* b); /* in: pointer to 7 bytes */
+/***********************************************************
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ byte* b, /* in: pointer to 8 bytes where to store */
+ dulint n); /* in: dulint integer to be stored */
+/************************************************************
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+dulint
+mach_read_from_8(
+/*=============*/
+ /* out: dulint integer */
+ byte* b); /* in: pointer to 8 bytes */
+/*************************************************************
+Writes a dulint in a compressed form. */
+UNIV_INLINE
+ulint
+mach_dulint_write_compressed(
+/*=========================*/
+ /* out: size in bytes */
+ byte* b, /* in: pointer to memory where to store */
+ dulint n); /* in: dulint integer to be stored */
+/*************************************************************
+Returns the size of a dulint when written in the compressed form. */
+UNIV_INLINE
+ulint
+mach_dulint_get_compressed_size(
+/*============================*/
+ /* out: compressed size in bytes */
+ dulint n); /* in: dulint integer to be stored */
+/*************************************************************
+Reads a dulint in a compressed form. */
+UNIV_INLINE
+dulint
+mach_dulint_read_compressed(
+/*========================*/
+ /* out: read dulint */
+ byte* b); /* in: pointer to memory from where to read */
+/*************************************************************
+Writes a dulint in a compressed form. */
+UNIV_INLINE
+ulint
+mach_dulint_write_much_compressed(
+/*==============================*/
+ /* out: size in bytes */
+ byte* b, /* in: pointer to memory where to store */
+ dulint n); /* in: dulint integer to be stored */
+/*************************************************************
+Returns the size of a dulint when written in the compressed form. */
+UNIV_INLINE
+ulint
+mach_dulint_get_much_compressed_size(
+/*=================================*/
+ /* out: compressed size in bytes */
+ dulint n); /* in: dulint integer to be stored */
+/*************************************************************
+Reads a dulint in a compressed form. */
+UNIV_INLINE
+dulint
+mach_dulint_read_much_compressed(
+/*=============================*/
+ /* out: read dulint */
+ byte* b); /* in: pointer to memory from where to read */
+/*************************************************************
+Reads a ulint in a compressed form if the log record fully contains it. */
+
+byte*
+mach_parse_compressed(
+/*==================*/
+ /* out: pointer to end of the stored field, NULL if
+ not complete */
+ byte* ptr, /* in: pointer to buffer from where to read */
+ byte* end_ptr,/* in: pointer to end of the buffer */
+ ulint* val); /* out: read value */
+/*************************************************************
+Reads a dulint in a compressed form if the log record fully contains it. */
+
+byte*
+mach_dulint_parse_compressed(
+/*=========================*/
+ /* out: pointer to end of the stored field, NULL if
+ not complete */
+ byte* ptr, /* in: pointer to buffer from where to read */
+ byte* end_ptr,/* in: pointer to end of the buffer */
+ dulint* val); /* out: read value */
+/*************************************************************
+Reads a double. It is stored in a little-endian format. */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ /* out: double read */
+ byte* b); /* in: pointer to memory from where to read */
+/*************************************************************
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /* in: pointer to memory where to write */
+ double d); /* in: double */
+/*************************************************************
+Reads a float. It is stored in a little-endian format. */
+UNIV_INLINE
+float
+mach_float_read(
+/*=============*/
+ /* out: float read */
+ byte* b); /* in: pointer to memory from where to read */
+/*************************************************************
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*==============*/
+ byte* b, /* in: pointer to memory where to write */
+ float d); /* in: float */
+/*************************************************************
+Reads a ulint stored in the little-endian format. */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ /* out: unsigned long int */
+ byte* buf, /* in: from where to read */
+ ulint buf_size); /* in: from how many bytes to read */
+/*************************************************************
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /* in: where to write */
+ ulint dest_size, /* in: into how many bytes to write */
+ ulint n); /* in: unsigned long int to write */
+/*************************************************************
+Reads a ulint stored in the little-endian format. */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ /* out: unsigned long int */
+ byte* buf); /* in: from where to read */
+/*************************************************************
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /* in: where to write */
+ ulint n); /* in: unsigned long int to write */
+
+#ifndef UNIV_NONINL
+#include "mach0data.ic"
+#endif
+
+#endif
diff --git a/innobase/include/mach0data.ic b/innobase/include/mach0data.ic
new file mode 100644
index 00000000000..6c93cb687a5
--- /dev/null
+++ b/innobase/include/mach0data.ic
@@ -0,0 +1,727 @@
+/**********************************************************************
+Utilities for converting data from the database file
+to the machine format.
+
+(c) 1995 Innobase Oy
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+/***********************************************************
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /* in: pointer to byte where to store */
+ ulint n) /* in: ulint integer to be stored, >= 0, < 256 */
+{
+ ut_ad(b);
+ ut_ad((n >= 0) && (n <= 0xFF));
+
+ b[0] = (byte)n;
+}
+
+/************************************************************
+The following function is used to fetch data from one byte. */
+UNIV_INLINE
+ulint
+mach_read_from_1(
+/*=============*/
+ /* out: ulint integer, >= 0, < 256 */
+ byte* b) /* in: pointer to byte */
+{
+ ut_ad(b);
+ return((ulint)(b[0]));
+}
+
+/***********************************************************
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /* in: pointer to two bytes where to store */
+ ulint n) /* in: ulint integer to be stored */
+{
+ ut_ad(b);
+ ut_ad(n <= 0xFFFF);
+
+ b[0] = (byte)(n >> 8);
+ b[1] = (byte)(n);
+}
+
+/************************************************************
+The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+ /* out: ulint integer */
+ byte* b) /* in: pointer to 2 bytes */
+{
+ ut_ad(b);
+ return( ((ulint)(b[0]) << 8)
+ + (ulint)(b[1])
+ );
+}
+
+/***********************************************************
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /* in: pointer to 3 bytes where to store */
+ ulint n) /* in: ulint integer to be stored */
+{
+ ut_ad(b);
+ ut_ad(n <= 0xFFFFFF);
+
+ b[0] = (byte)(n >> 16);
+ b[1] = (byte)(n >> 8);
+ b[2] = (byte)(n);
+}
+
+/************************************************************
+The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+ulint
+mach_read_from_3(
+/*=============*/
+ /* out: ulint integer */
+ byte* b) /* in: pointer to 3 bytes */
+{
+ ut_ad(b);
+ return( ((ulint)(b[0]) << 16)
+ + ((ulint)(b[1]) << 8)
+ + (ulint)(b[2])
+ );
+}
+
+/***********************************************************
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /* in: pointer to four bytes where to store */
+ ulint n) /* in: ulint integer to be stored */
+{
+ ut_ad(b);
+
+#if notdefined && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC)
+
+ /* We do not use this even on Intel, because unaligned accesses may
+ be slow */
+
+ __asm MOV EAX, n
+ __asm BSWAP EAX /* Intel is little-endian, must swap bytes */
+ __asm MOV n, EAX
+
+ *((ulint*)b) = n;
+#else
+ b[0] = (byte)(n >> 24);
+ b[1] = (byte)(n >> 16);
+ b[2] = (byte)(n >> 8);
+ b[3] = (byte)n;
+#endif
+}
+
+/************************************************************
+The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+ulint
+mach_read_from_4(
+/*=============*/
+ /* out: ulint integer */
+ byte* b) /* in: pointer to four bytes */
+{
+#if notdefined && !defined(__STDC__) && defined(UNIV_INTEL) && (UNIV_WORD_SIZE == 4) && defined(UNIV_VISUALC)
+ /* We do not use this even on Intel, because unaligned accesses may
+ be slow */
+
+ ulint res;
+
+ ut_ad(b);
+
+ __asm MOV EDX, b
+ __asm MOV ECX, DWORD PTR [EDX]
+ __asm BSWAP ECX /* Intel is little-endian, must swap bytes */
+ __asm MOV res, ECX
+
+ return(res);
+#else
+ ut_ad(b);
+ return( ((ulint)(b[0]) << 24)
+ + ((ulint)(b[1]) << 16)
+ + ((ulint)(b[2]) << 8)
+ + (ulint)(b[3])
+ );
+#endif
+}
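
For example, mach_write_to_4 stores n = 0x12345678 as the byte sequence 12 34 56 78, most significant byte at the lowest address, and mach_read_from_4 reassembles the same value from those four bytes.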
+
+/***********************************************************
+The following function is used to store data from a ulint to memory
+in standard order: we store the most significant byte to the lowest
+address. */
+UNIV_INLINE
+void
+mach_write(
+/*=======*/
+ byte* b, /* in: pointer to 4 bytes where to store */
+ ulint n) /* in: ulint integer to be stored */
+{
+ ut_ad(b);
+
+ b[0] = (byte)(n >> 24);
+ b[1] = (byte)(n >> 16);
+ b[2] = (byte)(n >> 8);
+ b[3] = (byte)n;
+}
+
+/************************************************************
+The following function is used to fetch data from memory to a ulint.
+The most significant byte is at the lowest address. */
+UNIV_INLINE
+ulint
+mach_read(
+/*======*/
+ /* out: ulint integer */
+ byte* b) /* in: pointer to 4 bytes */
+{
+ ut_ad(b);
+
+ return( ((ulint)(b[0]) << 24)
+ + ((ulint)(b[1]) << 16)
+ + ((ulint)(b[2]) << 8)
+ + (ulint)(b[3])
+ );
+}
+
+/*************************************************************
+Writes a ulint in a compressed form where the first byte codes the
+length of the stored ulint. We look at the most significant bits of
+the byte. If the most significant bit is zero, it means 1-byte storage,
+else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0,
+it means 3-byte storage, else if 4th is 0, it means 4-byte storage,
+else the storage is 5-byte. */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ /* out: compressed size in bytes */
+ byte* b, /* in: pointer to memory where to store */
+ ulint n) /* in: ulint integer (< 2^32) to be stored */
+{
+ ut_ad(b);
+
+ if (n < 0x80) {
+ mach_write_to_1(b, n);
+ return(1);
+ } else if (n < 0x4000) {
+ mach_write_to_2(b, n | 0x8000);
+ return(2);
+ } else if (n < 0x200000) {
+ mach_write_to_3(b, n | 0xC00000);
+ return(3);
+ } else if (n < 0x10000000) {
+ mach_write_to_4(b, n | 0xE0000000);
+ return(4);
+ } else {
+ mach_write_to_1(b, 0xF0);
+ mach_write_to_4(b + 1, n);
+ return(5);
+ }
+}
+
+/*************************************************************
+Returns the size of a ulint when written in the compressed form. */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ /* out: compressed size in bytes */
+ ulint n) /* in: ulint integer (< 2^32) to be stored */
+{
+ if (n < 0x80) {
+ return(1);
+ } else if (n < 0x4000) {
+ return(2);
+ } else if (n < 0x200000) {
+ return(3);
+ } else if (n < 0x10000000) {
+ return(4);
+ } else {
+ return(5);
+ }
+}
+
+/*************************************************************
+Reads a ulint in a compressed form. */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+ /* out: read integer (< 2^32) */
+ byte* b) /* in: pointer to memory from where to read */
+{
+ ulint flag;
+
+ ut_ad(b);
+
+ flag = mach_read_from_1(b);
+
+ if (flag < 0x80) {
+ return(flag);
+ } else if (flag < 0xC0) {
+ return(mach_read_from_2(b) & 0x7FFF);
+ } else if (flag < 0xE0) {
+ return(mach_read_from_3(b) & 0x3FFFFF);
+ } else if (flag < 0xF0) {
+ return(mach_read_from_4(b) & 0x1FFFFFFF);
+ } else {
+ ut_ad(flag == 0xF0);
+ return(mach_read_from_4(b + 1));
+ }
+}
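
A standalone illustration of the compressed format described above (not part of the patch; it uses plain C types instead of ulint and byte so that it compiles on its own). The value 70000 (0x11170) falls in the 3-byte range, so it is stored as the bytes C1 11 70:

#include <assert.h>

int
main(void)
{
	unsigned char	b[5];
	unsigned long	n = 70000UL;	/* 0x11170: needs 3-byte storage */
	unsigned long	v;

	/* write: 0x4000 <= n < 0x200000, so store (n | 0xC00000) in three
	bytes, most significant byte first */

	b[0] = (unsigned char)((n | 0xC00000UL) >> 16);	/* 0xC1 */
	b[1] = (unsigned char)(n >> 8);			/* 0x11 */
	b[2] = (unsigned char)(n);			/* 0x70 */

	/* read: the flag byte is in [0xC0, 0xE0), so read three bytes and
	mask off the two flag bits */

	v = (((unsigned long)b[0] << 16)
	     + ((unsigned long)b[1] << 8)
	     + (unsigned long)b[2]) & 0x3FFFFFUL;

	assert(v == n);

	return(0);
}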
+
+/***********************************************************
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ byte* b, /* in: pointer to 8 bytes where to store */
+ dulint n) /* in: dulint integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_4(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + 4, ut_dulint_get_low(n));
+}
+
+/************************************************************
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+dulint
+mach_read_from_8(
+/*=============*/
+ /* out: dulint integer */
+ byte* b) /* in: pointer to 8 bytes */
+{
+ ulint high;
+ ulint low;
+
+ ut_ad(b);
+
+ high = mach_read_from_4(b);
+ low = mach_read_from_4(b + 4);
+
+ return(ut_dulint_create(high, low));
+}
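
For example, a dulint with high word 0x00000001 and low word 0xDEADBEEF is stored by mach_write_to_8 as the bytes 00 00 00 01 DE AD BE EF, and mach_read_from_8 rebuilds the same dulint from them.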
+
+/***********************************************************
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /* in: pointer to 7 bytes where to store */
+ dulint n) /* in: dulint integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_3(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + 3, ut_dulint_get_low(n));
+}
+
+/************************************************************
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+dulint
+mach_read_from_7(
+/*=============*/
+ /* out: dulint integer */
+ byte* b) /* in: pointer to 7 bytes */
+{
+ ulint high;
+ ulint low;
+
+ ut_ad(b);
+
+ high = mach_read_from_3(b);
+ low = mach_read_from_4(b + 3);
+
+ return(ut_dulint_create(high, low));
+}
+
+/***********************************************************
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /* in: pointer to 6 bytes where to store */
+ dulint n) /* in: dulint integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_2(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + 2, ut_dulint_get_low(n));
+}
+
+/************************************************************
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address. */
+UNIV_INLINE
+dulint
+mach_read_from_6(
+/*=============*/
+ /* out: dulint integer */
+	byte*	b)	/* in: pointer to 6 bytes */
+{
+ ulint high;
+ ulint low;
+
+ ut_ad(b);
+
+ high = mach_read_from_2(b);
+ low = mach_read_from_4(b + 2);
+
+ return(ut_dulint_create(high, low));
+}
+
+/*************************************************************
+Writes a dulint in a compressed form. */
+UNIV_INLINE
+ulint
+mach_dulint_write_compressed(
+/*=========================*/
+ /* out: size in bytes */
+ byte* b, /* in: pointer to memory where to store */
+ dulint n) /* in: dulint integer to be stored */
+{
+ ulint size;
+
+ ut_ad(b);
+
+ size = mach_write_compressed(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + size, ut_dulint_get_low(n));
+
+ return(size + 4);
+}
+
+/*************************************************************
+Returns the size of a dulint when written in the compressed form. */
+UNIV_INLINE
+ulint
+mach_dulint_get_compressed_size(
+/*============================*/
+ /* out: compressed size in bytes */
+ dulint n) /* in: dulint integer to be stored */
+{
+ return(4 + mach_get_compressed_size(ut_dulint_get_high(n)));
+}
+
+/*************************************************************
+Reads a dulint in a compressed form. */
+UNIV_INLINE
+dulint
+mach_dulint_read_compressed(
+/*========================*/
+ /* out: read dulint */
+ byte* b) /* in: pointer to memory from where to read */
+{
+ ulint high;
+ ulint low;
+ ulint size;
+
+ ut_ad(b);
+
+ high = mach_read_compressed(b);
+
+ size = mach_get_compressed_size(high);
+
+ low = mach_read_from_4(b + size);
+
+ return(ut_dulint_create(high, low));
+}
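
A worked example of the compressed dulint form above: the high word is stored in the compressed format and the low word always takes a full four bytes, so a dulint with high word 0 and low word 0x12345678 occupies 1 + 4 = 5 bytes: 00 12 34 56 78.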
+
+/*************************************************************
+Writes a dulint in a compressed form. */
+UNIV_INLINE
+ulint
+mach_dulint_write_much_compressed(
+/*==============================*/
+ /* out: size in bytes */
+ byte* b, /* in: pointer to memory where to store */
+ dulint n) /* in: dulint integer to be stored */
+{
+ ulint size;
+
+ ut_ad(b);
+
+ if (ut_dulint_get_high(n) == 0) {
+ return(mach_write_compressed(b, ut_dulint_get_low(n)));
+ }
+
+ *b = 0xFF;
+ size = 1 + mach_write_compressed(b + 1, ut_dulint_get_high(n));
+
+ size += mach_write_compressed(b + size, ut_dulint_get_low(n));
+
+ return(size);
+}
+
+/*************************************************************
+Returns the size of a dulint when written in the compressed form. */
+UNIV_INLINE
+ulint
+mach_dulint_get_much_compressed_size(
+/*=================================*/
+ /* out: compressed size in bytes */
+ dulint n) /* in: dulint integer to be stored */
+{
+ if (0 == ut_dulint_get_high(n)) {
+ return(mach_get_compressed_size(ut_dulint_get_low(n)));
+ }
+
+ return(1 + mach_get_compressed_size(ut_dulint_get_high(n))
+ + mach_get_compressed_size(ut_dulint_get_low(n)));
+}
+
+/*************************************************************
+Reads a dulint in a compressed form. */
+UNIV_INLINE
+dulint
+mach_dulint_read_much_compressed(
+/*=============================*/
+ /* out: read dulint */
+ byte* b) /* in: pointer to memory from where to read */
+{
+ ulint high;
+ ulint low;
+ ulint size;
+
+ ut_ad(b);
+
+ if (*b != 0xFF) {
+ high = 0;
+ size = 0;
+ } else {
+ high = mach_read_compressed(b + 1);
+
+ size = 1 + mach_get_compressed_size(high);
+ }
+
+ low = mach_read_compressed(b + size);
+
+ return(ut_dulint_create(high, low));
+}
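
A worked example of the 'much compressed' form above: for a dulint with high word 5 and low word 0x12345, the high word is nonzero, so the marker byte 0xFF is written first, then the high word compressed into one byte (05), then the low word compressed into three bytes (C1 23 45), five bytes in all: FF 05 C1 23 45. The reader sees the 0xFF marker, decodes the high word, uses its compressed size to locate the low word, and decodes that in turn.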
+
+/*************************************************************
+Reads a double. It is stored in a little-endian format. */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ /* out: double read */
+ byte* b) /* in: pointer to memory from where to read */
+{
+ double d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(double) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*************************************************************
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /* in: pointer to memory where to write */
+ double d) /* in: double */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(double) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*************************************************************
+Reads a float. It is stored in a little-endian format. */
+UNIV_INLINE
+float
+mach_float_read(
+/*=============*/
+ /* out: float read */
+ byte* b) /* in: pointer to memory from where to read */
+{
+ float d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(float) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*************************************************************
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*==============*/
+ byte* b, /* in: pointer to memory where to write */
+ float d) /* in: float */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(float) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*************************************************************
+Reads a ulint stored in the little-endian format. */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ /* out: unsigned long int */
+ byte* buf, /* in: from where to read */
+ ulint buf_size) /* in: from how many bytes to read */
+{
+ ulint n = 0;
+ byte* ptr;
+
+ ut_ad(buf_size <= sizeof(ulint));
+ ut_ad(buf_size > 0);
+
+ ptr = buf + buf_size;
+
+ for (;;) {
+ ptr--;
+
+ n = n << 8;
+
+ n += (ulint)(*ptr);
+
+ if (ptr == buf) {
+ break;
+ }
+ }
+
+ return(n);
+}
+
+/*************************************************************
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /* in: where to write */
+ ulint dest_size, /* in: into how many bytes to write */
+ ulint n) /* in: unsigned long int to write */
+{
+ byte* end;
+
+ ut_ad(dest_size <= sizeof(ulint));
+ ut_ad(dest_size > 0);
+
+ end = dest + dest_size;
+
+ for (;;) {
+ *dest = (byte)(n & 0xFF);
+
+ n = n >> 8;
+
+ dest++;
+
+ if (dest == end) {
+ break;
+ }
+ }
+
+ ut_ad(n == 0);
+}
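
For example, with dest_size = 2 and n = 0x1234, mach_write_to_n_little_endian writes the bytes 34 12 (least significant byte first), and mach_read_from_n_little_endian with buf_size = 2 recovers 0x1234 from them.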
+
+/*************************************************************
+Reads a ulint stored in the little-endian format. */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ /* out: unsigned long int */
+ byte* buf) /* in: from where to read */
+{
+ return((ulint)(*buf) + ((ulint)(*(buf + 1))) * 256);
+}
+
+/*************************************************************
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /* in: where to write */
+ ulint n) /* in: unsigned long int to write */
+{
+ ut_ad(n < 256 * 256);
+
+ *dest = (byte)(n & 0xFF);
+
+ n = n >> 8;
+ dest++;
+
+ *dest = (byte)(n & 0xFF);
+}
+
diff --git a/innobase/include/makefilewin.i b/innobase/include/makefilewin.i
new file mode 100644
index 00000000000..f756cf2ea3a
--- /dev/null
+++ b/innobase/include/makefilewin.i
@@ -0,0 +1,34 @@
+# File included in all makefiles of the database
+# (c) Innobase Oy 1995 - 2000
+
+CCOM=cl
+
+# Flags for the debug version
+#CFL= -MTd -Za -Zi -W4 -WX -F8192 -D "WIN32"
+#CFLN = -MTd -Zi -W4 -F8192 -D "WIN32"
+#CFLW = -MTd -Zi -W3 -WX -F8192 -D "WIN32"
+#LFL =
+
+# Flags for the fast version
+#CFL= -MT -Zi -Og -O2 -W3 -WX -D "WIN32"
+#CFLN = -MT -Zi -Og -O2 -W3 -D "WIN32"
+#CFLW = -MT -Zi -Og -O2 -W3 -WX -D "WIN32"
+#LFL =
+
+# Flags for the fast debug version
+CFL= -MTd -Zi -W3 -WX -F8192 -D "WIN32"
+CFLN = -MTd -Zi -W3 -F8192 -D "WIN32"
+CFLW = -MTd -Zi -W3 -WX -F8192 -D "WIN32"
+LFL = /link/NODEFAULTLIB:LIBCMT
+
+# Flags for the profiler version
+#CFL= -MT -Zi -Og -O2 -W3 -WX -D "WIN32"
+#CFLN = -MT -Zi -Og -O2 -WX -D "WIN32"
+#CFLW = -MT -Zi -Og -O2 -W3 -WX -D "WIN32"
+#LFL= -link -PROFILE
+
+# Flags for the fast version without debug info (= the production version)
+#CFL= -MT -Og -O2 -G6 -W3 -WX -D "WIN32"
+#CFLN = -MT -Og -O2 -G6 -W3 -D "WIN32"
+#CFLW = -MT -Og -O2 -G6 -W3 -WX -D "WIN32"
+#LFL =
diff --git a/innobase/include/mem0dbg.h b/innobase/include/mem0dbg.h
new file mode 100644
index 00000000000..dda37626198
--- /dev/null
+++ b/innobase/include/mem0dbg.h
@@ -0,0 +1,117 @@
+/******************************************************
+The memory management: the debug code. This is not a compilation module,
+but is included in mem0mem.* !
+
+(c) 1994, 1995 Innobase Oy
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+/* In the debug version each allocated field is surrounded with
+check fields whose sizes are given below */
+
+#define MEM_FIELD_HEADER_SIZE ut_calc_align(2 * sizeof(ulint),\
+ UNIV_MEM_ALIGNMENT)
+#define MEM_FIELD_TRAILER_SIZE sizeof(ulint)
+
+#define MEM_BLOCK_MAGIC_N 764741
+
+/* Space needed when allocating for a user a field of
+length N. The space is allocated only in multiples of
+UNIV_MEM_ALIGNMENT. In the debug version there are also
+check fields at the both ends of the field. */
+#ifdef UNIV_MEM_DEBUG
+#define MEM_SPACE_NEEDED(N) ut_calc_align((N) + MEM_FIELD_HEADER_SIZE\
+ + MEM_FIELD_TRAILER_SIZE,\
+ UNIV_MEM_ALIGNMENT)
+#else
+#define MEM_SPACE_NEEDED(N) ut_calc_align((N), UNIV_MEM_ALIGNMENT)
+#endif
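
For example, assuming UNIV_MEM_ALIGNMENT is 8, a user request of 13 bytes consumes MEM_SPACE_NEEDED(13) = 16 bytes in the non-debug version; in the debug version the header and trailer check fields are added to the 13 bytes before rounding up to the alignment.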
+
+/*******************************************************************
+Checks a memory heap for consistency and prints the contents if requested.
+Outputs the sum of sizes of buffers given to the user (only in
+the debug version), the physical size of the heap and the number of
+blocks in the heap. In case of error returns 0 as sizes and number
+of blocks. */
+
+void
+mem_heap_validate_or_print(
+/*=======================*/
+ mem_heap_t* heap, /* in: memory heap */
+ byte* top, /* in: calculate and validate only until
+ this top pointer in the heap is reached,
+ if this pointer is NULL, ignored */
+ ibool print, /* in: if TRUE, prints the contents
+ of the heap; works only in
+ the debug version */
+ ibool* error, /* out: TRUE if error */
+ ulint* us_size,/* out: allocated memory
+ (for the user) in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored; in the
+ non-debug version this is always -1 */
+ ulint* ph_size,/* out: physical size of the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+ ulint* n_blocks); /* out: number of blocks in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+/******************************************************************
+Prints the contents of a memory heap. */
+
+void
+mem_heap_print(
+/*===========*/
+ mem_heap_t* heap); /* in: memory heap */
+/******************************************************************
+Checks that an object is a memory heap (or a block of it) */
+
+ibool
+mem_heap_check(
+/*===========*/
+ /* out: TRUE if ok */
+ mem_heap_t* heap); /* in: memory heap */
+/******************************************************************
+Validates the contents of a memory heap. */
+
+ibool
+mem_heap_validate(
+/*==============*/
+ /* out: TRUE if ok */
+ mem_heap_t* heap); /* in: memory heap */
+/*********************************************************************
+Prints information of dynamic memory usage and currently live
+memory heaps or buffers. Can only be used in the debug version. */
+
+void
+mem_print_info(void);
+/*=================*/
+/*********************************************************************
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers since the last ..._print_info or ..._print_new_info. */
+
+void
+mem_print_new_info(void);
+/*====================*/
+/*********************************************************************
+TRUE if no memory is currently allocated. */
+
+ibool
+mem_all_freed(void);
+/*===============*/
+ /* out: TRUE if no heaps exist */
+/*********************************************************************
+Validates the dynamic memory */
+
+ibool
+mem_validate_no_assert(void);
+/*=========================*/
+ /* out: TRUE if error */
+/****************************************************************
+Validates the dynamic memory */
+
+ibool
+mem_validate(void);
+/*===============*/
+ /* out: TRUE if ok */
diff --git a/innobase/include/mem0dbg.ic b/innobase/include/mem0dbg.ic
new file mode 100644
index 00000000000..765e23e747e
--- /dev/null
+++ b/innobase/include/mem0dbg.ic
@@ -0,0 +1,91 @@
+/************************************************************************
+The memory management: the debug code. This is not an independent
+compilation module but is included in mem0mem.*.
+
+(c) 1994, 1995 Innobase Oy
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+extern mutex_t mem_hash_mutex;
+extern ulint mem_current_allocated_memory;
+
+/**********************************************************************
+Initializes an allocated memory field in the debug version. */
+
+void
+mem_field_init(
+/*===========*/
+ byte* buf, /* in: memory field */
+ ulint n); /* in: how many bytes the user requested */
+/**********************************************************************
+Erases an allocated memory field in the debug version. */
+
+void
+mem_field_erase(
+/*============*/
+ byte* buf, /* in: memory field */
+ ulint n); /* in: how many bytes the user requested */
+/*******************************************************************
+Initializes a buffer to a random combination of hex BA and BE.
+Used to initialize allocated memory. */
+
+void
+mem_init_buf(
+/*=========*/
+ byte* buf, /* in: pointer to buffer */
+ ulint n); /* in: length of buffer */
+/*******************************************************************
+Initializes a buffer to a random combination of hex DE and AD.
+Used to erase freed memory. */
+
+void
+mem_erase_buf(
+/*==========*/
+ byte* buf, /* in: pointer to buffer */
+ ulint n); /* in: length of buffer */
+/*******************************************************************
+Inserts a created memory heap to the hash table of
+current allocated memory heaps.
+Initializes the hash table when first called. */
+
+void
+mem_hash_insert(
+/*============*/
+ mem_heap_t* heap, /* in: the created heap */
+ char* file_name, /* in: file name of creation */
+ ulint line); /* in: line where created */
+/*******************************************************************
+Removes a memory heap (which is going to be freed by the caller)
+from the list of live memory heaps. Returns the size of the heap
+in terms of how much memory in bytes was allocated for the user of
+the heap (not the total space occupied by the heap).
+Also validates the heap.
+NOTE: This function does not free the storage occupied by the
+heap itself, only the node in the list of heaps. */
+
+void
+mem_hash_remove(
+/*============*/
+ mem_heap_t* heap, /* in: the heap to be freed */
+ char* file_name, /* in: file name of freeing */
+ ulint line); /* in: line where freed */
+
+
+void
+mem_field_header_set_len(byte* field, ulint len);
+
+ulint
+mem_field_header_get_len(byte* field);
+
+void
+mem_field_header_set_check(byte* field, ulint check);
+
+ulint
+mem_field_header_get_check(byte* field);
+
+void
+mem_field_trailer_set_check(byte* field, ulint check);
+
+ulint
+mem_field_trailer_get_check(byte* field);
diff --git a/innobase/include/mem0mem.h b/innobase/include/mem0mem.h
new file mode 100644
index 00000000000..a2259a97503
--- /dev/null
+++ b/innobase/include/mem0mem.h
@@ -0,0 +1,350 @@
+/******************************************************
+The memory management
+
+(c) 1994, 1995 Innobase Oy
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0mem_h
+#define mem0mem_h
+
+#include "univ.i"
+#include "ut0mem.h"
+#include "ut0byte.h"
+#include "ut0ut.h"
+#include "ut0rnd.h"
+#include "sync0sync.h"
+#include "ut0lst.h"
+#include "mach0data.h"
+
+/* -------------------- MEMORY HEAPS ----------------------------- */
+
+/* The info structure stored at the beginning of a heap block */
+typedef struct mem_block_info_struct mem_block_info_t;
+
+/* A block of a memory heap consists of the info structure
+followed by an area of memory */
+typedef mem_block_info_t mem_block_t;
+
+/* A memory heap is a nonempty linear list of memory blocks */
+typedef mem_block_t mem_heap_t;
+
+/* Types of allocation for memory heaps: DYNAMIC means allocation from the
+dynamic memory pool of the C compiler, BUFFER means allocation from the index
+page buffer pool; the latter method is used for very big heaps */
+
+#define MEM_HEAP_DYNAMIC 0 /* the most common type */
+#define MEM_HEAP_BUFFER 1
+#define MEM_HEAP_BTR_SEARCH 2 /* this flag can be ORed to the
+ previous */
+
+/* The following start size is used for the first block in the memory heap if
+the size is not specified, i.e., 0 is given as the parameter in the call of
+create. The standard size is the maximum size of the blocks used for
+allocations of small buffers. */
+
+#define MEM_BLOCK_START_SIZE 64
+#define MEM_BLOCK_STANDARD_SIZE 8192
+
+/* If a memory heap is allowed to grow into the buffer pool, the following
+is the maximum size for a single allocated buffer: */
+#define MEM_MAX_ALLOC_IN_BUF (UNIV_PAGE_SIZE - 200)
+
+/**********************************************************************
+Initializes the memory system. */
+
+void
+mem_init(
+/*=====*/
+ ulint size); /* in: common pool size in bytes */
+/******************************************************************
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+#ifdef UNIV_MEM_DEBUG
+#define mem_heap_create(N) mem_heap_create_func(\
+ (N), NULL, MEM_HEAP_DYNAMIC,\
+ __FILE__, __LINE__)
+#else
+#define mem_heap_create(N) mem_heap_create_func(N, NULL, MEM_HEAP_DYNAMIC)
+#endif
+/******************************************************************
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+#ifdef UNIV_MEM_DEBUG
+#define mem_heap_create_in_buffer(N) mem_heap_create_func(\
+ (N), NULL, MEM_HEAP_BUFFER,\
+ __FILE__, __LINE__)
+#else
+#define mem_heap_create_in_buffer(N) mem_heap_create_func(N, NULL,\
+ MEM_HEAP_BUFFER)
+#endif
+/******************************************************************
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+#ifdef UNIV_MEM_DEBUG
+#define mem_heap_create_in_btr_search(N) mem_heap_create_func(\
+ (N), NULL, MEM_HEAP_BTR_SEARCH |\
+ MEM_HEAP_BUFFER,\
+ __FILE__, __LINE__)
+#else
+#define mem_heap_create_in_btr_search(N) mem_heap_create_func(N, NULL,\
+ MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER)
+#endif
+/******************************************************************
+Use this macro instead of the corresponding function! Macro for fast
+memory heap creation. An initial block of memory B is given by the
+caller, N is its size, and this memory block is not freed by
+mem_heap_free. See the parameter comment in mem_heap_create_func below. */
+#ifdef UNIV_MEM_DEBUG
+#define mem_heap_fast_create(N, B) mem_heap_create_func(\
+ (N), (B), MEM_HEAP_DYNAMIC,\
+ __FILE__, __LINE__)
+#else
+#define mem_heap_fast_create(N, B) mem_heap_create_func(N, (B),\
+ MEM_HEAP_DYNAMIC)
+#endif
+/******************************************************************
+Use this macro instead of the corresponding function! Macro for memory
+heap freeing. */
+#ifdef UNIV_MEM_DEBUG
+#define mem_heap_free(heap) mem_heap_free_func(\
+ (heap), __FILE__, __LINE__)
+#else
+#define mem_heap_free(heap) mem_heap_free_func(heap)
+#endif
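
A hypothetical usage sketch of the heap macros above (not part of the patch; the function name heap_example is made up). It assumes mem0mem.h is included and mem_init() has been called at startup:

void
heap_example(void)
/*==============*/
{
	mem_heap_t*	heap;
	byte*		buf;

	/* create a heap whose first block fits a 100-byte user buffer */
	heap = mem_heap_create(100);

	/* allocate 50 bytes from the heap */
	buf = mem_heap_alloc(heap, 50);

	buf[0] = 0;	/* ... use the buffer ... */

	/* free every allocation made from the heap in one call */
	mem_heap_free(heap);
}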
+/*********************************************************************
+NOTE: Use the corresponding macros instead of this function. Creates a
+memory heap which allocates memory from dynamic space. For debugging
+purposes, takes also the file name and line as argument in the debug
+version. */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+/*=================*/
+ /* out, own: memory heap */
+ ulint n, /* in: desired start block size,
+ this means that a single user buffer
+ of size n will fit in the block,
+ 0 creates a default size block;
+ if init_block is not NULL, n tells
+ its size in bytes */
+ void* init_block, /* in: if very fast creation is
+ wanted, the caller can reserve some
+ memory from its stack, for example,
+					and pass it as the initial block
+ to the heap: then no OS call of malloc
+ is needed at the creation. CAUTION:
+ the caller must make sure the initial
+ block is not unintentionally erased
+ (if allocated in the stack), before
+ the memory heap is explicitly freed. */
+ ulint type /* in: MEM_HEAP_DYNAMIC or MEM_HEAP_BUFFER */
+ #ifdef UNIV_MEM_DEBUG
+ ,char* file_name, /* in: file name where created */
+ ulint line /* in: line where created */
+ #endif
+ );
+/*********************************************************************
+NOTE: Use the corresponding macro instead of this function.
+Frees the space occupied by a memory heap. */
+UNIV_INLINE
+void
+mem_heap_free_func(
+/*===============*/
+ mem_heap_t* heap /* in, own: heap to be freed */
+ #ifdef UNIV_MEM_DEBUG
+ ,char* file_name, /* in: file name where freed */
+ ulint line /* in: line where freed */
+ #endif
+);
+/*******************************************************************
+Allocates n bytes of memory from a memory heap. */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+/*===========*/
+ /* out: allocated storage, NULL if
+ did not succeed */
+ mem_heap_t* heap, /* in: memory heap */
+ ulint n); /* in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+/*********************************************************************
+Returns a pointer to the heap top. */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+/*==================*/
+ /* out: pointer to the heap top */
+ mem_heap_t* heap); /* in: memory heap */
+/*********************************************************************
+Frees the space in a memory heap exceeding the pointer given. The
+pointer must have been acquired from mem_heap_get_heap_top. The first
+memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+/*===================*/
+ mem_heap_t* heap, /* in: heap from which to free */
+ byte* old_top);/* in: pointer to old top of heap */
+/*********************************************************************
+Empties a memory heap. The first memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_empty(
+/*===========*/
+ mem_heap_t* heap); /* in: heap to empty */
+/*********************************************************************
+Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given. */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+/*=============*/
+ /* out: pointer to the topmost element */
+ mem_heap_t* heap, /* in: memory heap */
+ ulint n); /* in: size of the topmost element */
+/*********************************************************************
+Frees the topmost element in a memory heap.
+The size of the element must be given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /* in: memory heap */
+ ulint n); /* in: size of the topmost element */
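+/*********************************************************************
+Illustrative sketch, not part of the original header: the topmost
+allocation can be read back and popped stack-wise; the 16-byte size is
+arbitrary but must match the size used in the allocation. */
+#if 0
+static void
+mem_heap_top_stack_example(mem_heap_t* heap)
+{
+	void*	elem;
+
+	elem = mem_heap_alloc(heap, 16);
+
+	ut_ad(elem == mem_heap_get_top(heap, 16));
+
+	mem_heap_free_top(heap, 16);	/* pops the 16-byte element */
+}
+#endif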
+/*********************************************************************
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap); /* in: heap */
+/******************************************************************
+Use this macro instead of the corresponding function!
+Macro for memory buffer allocation */
+#ifdef UNIV_MEM_DEBUG
+#define mem_alloc(N) mem_alloc_func(\
+ (N), __FILE__, __LINE__)
+#else
+#define mem_alloc(N) mem_alloc_func(N)
+#endif
+/******************************************************************
+Use this macro instead of the corresponding function!
+Macro for memory buffer allocation */
+#ifdef UNIV_MEM_DEBUG
+#define mem_alloc_noninline(N) mem_alloc_func_noninline(\
+ (N), __FILE__, __LINE__)
+#else
+#define mem_alloc_noninline(N) mem_alloc_func_noninline(N)
+#endif
+/*******************************************************************
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. Is like malloc of C. The buffer must be freed
+with mem_free. */
+UNIV_INLINE
+void*
+mem_alloc_func(
+/*===========*/
+ /* out, own: free storage, NULL
+ if did not succeed */
+ ulint n /* in: desired number of bytes */
+ #ifdef UNIV_MEM_DEBUG
+ ,char* file_name, /* in: file name where created */
+ ulint line /* in: line where created */
+ #endif
+);
+/*******************************************************************
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. Is like malloc of C. The buffer must be freed
+with mem_free. */
+
+void*
+mem_alloc_func_noninline(
+/*=====================*/
+ /* out, own: free storage, NULL if did not
+ succeed */
+ ulint n /* in: desired number of bytes */
+ #ifdef UNIV_MEM_DEBUG
+ ,char* file_name, /* in: file name where created */
+ ulint line /* in: line where created */
+ #endif
+ );
+/******************************************************************
+Use this macro instead of the corresponding function!
+Macro for memory buffer freeing */
+#ifdef UNIV_MEM_DEBUG
+#define mem_free(PTR) mem_free_func(\
+ (PTR), __FILE__, __LINE__)
+#else
+#define mem_free(PTR) mem_free_func(PTR)
+#endif
+/*******************************************************************
+NOTE: Use the corresponding macro instead of this function.
+Frees a single buffer of storage from
+the dynamic memory of C compiler. Similar to free of C. */
+UNIV_INLINE
+void
+mem_free_func(
+/*==========*/
+ void* ptr /* in, own: buffer to be freed */
+ #ifdef UNIV_MEM_DEBUG
+ ,char* file_name, /* in: file name where created */
+ ulint line /* in: line where created */
+ #endif
+);
+/*******************************************************************
+Implements realloc. */
+UNIV_INLINE
+void*
+mem_realloc(
+/*========*/
+ /* out, own: free storage, NULL if did not succeed */
+ void* buf, /* in: pointer to an old buffer */
+ ulint n); /* in: desired number of bytes */
+
+
+/*#######################################################################*/
+
+/* The info header of a block in a memory heap */
+
+struct mem_block_info_struct {
+ UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in the
+ the list this is the base node of the list of blocks;
+ in subsequent blocks this is undefined */
+ UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next
+ and prev in the list. The first block allocated
+ to the heap is also the first block in this list,
+ though it also contains the base node of the list. */
+ ulint len; /* physical length of this block in bytes */
+ ulint type; /* type of heap: MEM_HEAP_DYNAMIC, or
+			MEM_HEAP_BUFFER possibly ORed to MEM_HEAP_BTR_SEARCH */
+ ibool init_block; /* TRUE if this is the first block used in fast
+ creation of a heap: the memory will be freed
+ by the creator, not by mem_heap_free */
+ ulint free; /* offset in bytes of the first free position for
+ user data in the block */
+ ulint start; /* the value of the struct field 'free' at the
+ creation of the block */
+ byte* free_block;
+ /* if the MEM_HEAP_BTR_SEARCH bit is set in type,
+ and this is the heap root, this can contain an
+ allocated buffer frame, which can be appended as a
+ free block to the heap, if we need more space;
+ otherwise, this is NULL */
+ ulint magic_n;/* magic number for debugging */
+};
+
+/* Header size for a memory heap block */
+#define MEM_BLOCK_HEADER_SIZE ut_calc_align(sizeof(mem_block_info_t),\
+ UNIV_MEM_ALIGNMENT)
+#include "mem0dbg.h"
+
+#ifndef UNIV_NONINL
+#include "mem0mem.ic"
+#endif
+
+#endif
diff --git a/innobase/include/mem0mem.ic b/innobase/include/mem0mem.ic
new file mode 100644
index 00000000000..8b8449469ef
--- /dev/null
+++ b/innobase/include/mem0mem.ic
@@ -0,0 +1,597 @@
+/************************************************************************
+The memory management
+
+(c) 1994, 1995 Innobase Oy
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0dbg.ic"
+
+#include "mem0pool.h"
+
+/*******************************************************************
+Creates a memory heap block where data can be allocated. */
+
+mem_block_t*
+mem_heap_create_block(
+/*==================*/
+ /* out, own: memory heap block, NULL if did not
+ succeed */
+ mem_heap_t* heap,/* in: memory heap or NULL if first block should
+ be created */
+ ulint n, /* in: number of bytes needed for user data, or
+ if init_block is not NULL, its size in bytes */
+ void* init_block, /* in: init block in fast create, type must be
+ MEM_HEAP_DYNAMIC */
+ ulint type); /* in: type of heap: MEM_HEAP_DYNAMIC or
+ MEM_HEAP_BUFFER */
+/**********************************************************************
+Frees a block from a memory heap. */
+
+void
+mem_heap_block_free(
+/*================*/
+ mem_heap_t* heap, /* in: heap */
+ mem_block_t* block); /* in: block to free */
+/**********************************************************************
+Frees the free_block field from a memory heap. */
+
+void
+mem_heap_free_block_free(
+/*=====================*/
+ mem_heap_t* heap); /* in: heap */
+/*******************************************************************
+Adds a new block to a memory heap. */
+
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+ /* out: created block, NULL if did not
+ succeed */
+ mem_heap_t* heap, /* in: memory heap */
+ ulint n); /* in: number of bytes user needs */
+
+UNIV_INLINE
+void
+mem_block_set_len(mem_block_t* block, ulint len)
+{
+ ut_ad(len > 0);
+
+ block->len = len;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_len(mem_block_t* block)
+{
+ return(block->len);
+}
+
+UNIV_INLINE
+void
+mem_block_set_type(mem_block_t* block, ulint type)
+{
+ ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+ || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+ block->type = type;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_type(mem_block_t* block)
+{
+ return(block->type);
+}
+
+UNIV_INLINE
+void
+mem_block_set_free(mem_block_t* block, ulint free)
+{
+ ut_ad(free > 0);
+ ut_ad(free <= mem_block_get_len(block));
+
+ block->free = free;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_free(mem_block_t* block)
+{
+ return(block->free);
+}
+
+UNIV_INLINE
+void
+mem_block_set_start(mem_block_t* block, ulint start)
+{
+ ut_ad(start > 0);
+
+ block->start = start;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_start(mem_block_t* block)
+{
+ return(block->start);
+}
+
+/*******************************************************************
+Allocates n bytes of memory from a memory heap. */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+/*===========*/
+ /* out: allocated storage */
+ mem_heap_t* heap, /* in: memory heap */
+ ulint n) /* in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+{
+ mem_block_t* block;
+ void* buf;
+ ulint free;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF));
+
+ /* Check if there is enough space in block. If not, create a new
+ block to the heap */
+
+ if (mem_block_get_len(block)
+ < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) {
+
+ block = mem_heap_add_block(heap, n);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+ }
+
+ free = mem_block_get_free(block);
+
+ buf = (byte*)block + free;
+
+ mem_block_set_free(block, free + MEM_SPACE_NEEDED(n));
+
+ #ifdef UNIV_MEM_DEBUG
+
+ /* In the debug version write debugging info to the field */
+ mem_field_init((byte*)buf, n);
+
+ /* Advance buf to point at the storage which will be given to the
+ caller */
+ buf = (byte*)buf + MEM_FIELD_HEADER_SIZE;
+
+ #endif
+
+ return(buf);
+}
+
+/*********************************************************************
+Returns a pointer to the heap top. */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+/*==================*/
+ /* out: pointer to the heap top */
+ mem_heap_t* heap) /* in: memory heap */
+{
+ mem_block_t* block;
+ byte* buf;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*)block + mem_block_get_free(block);
+
+ return(buf);
+}
+
+/*********************************************************************
+Frees the space in a memory heap exceeding the pointer given. The
+pointer must have been acquired from mem_heap_get_heap_top. The first
+memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+/*===================*/
+ mem_heap_t* heap, /* in: heap from which to free */
+ byte* old_top)/* in: pointer to old top of heap */
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+ #ifdef UNIV_MEM_DEBUG
+ ibool error;
+ ulint total_size;
+ ulint size;
+ #endif
+
+ ut_ad(mem_heap_check(heap));
+
+ #ifdef UNIV_MEM_DEBUG
+
+ /* Validate the heap and get its total allocated size */
+ mem_heap_validate_or_print(heap, NULL, FALSE, &error, &total_size,
+ NULL, NULL);
+ ut_a(!error);
+
+ /* Get the size below top pointer */
+ mem_heap_validate_or_print(heap, old_top, FALSE, &error, &size, NULL,
+ NULL);
+ ut_a(!error);
+
+ #endif
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ while (block != NULL) {
+ if (((byte*)block + mem_block_get_free(block) >= old_top)
+ && ((byte*)block <= old_top)) {
+ /* Found the right block */
+
+ break;
+ }
+
+ /* Store prev_block value before freeing the current block
+ (the current block will be erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+
+ ut_ad(block);
+
+ /* Set the free field of block */
+ mem_block_set_free(block, old_top - (byte*)block);
+
+ #ifdef UNIV_MEM_DEBUG
+ ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+
+ /* In the debug version erase block from top up */
+
+ mem_erase_buf(old_top, (byte*)block + block->len - old_top);
+
+ /* Update allocated memory count */
+ mutex_enter(&mem_hash_mutex);
+ mem_current_allocated_memory -= (total_size - size);
+ mutex_exit(&mem_hash_mutex);
+
+ #endif
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block) ==
+ mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ }
+}
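+
+/*********************************************************************
+Illustrative sketch, not part of the original file: the heap top acts
+like a savepoint; everything allocated after mem_heap_get_heap_top is
+released by mem_heap_free_heap_top, while older allocations are kept.
+The 64-byte size is arbitrary. */
+#if 0
+static void
+mem_heap_top_usage_example(mem_heap_t* heap)
+{
+	byte*	old_top;
+	void*	tmp;
+
+	old_top = mem_heap_get_heap_top(heap);
+
+	tmp = mem_heap_alloc(heap, 64);
+
+	/* ... use tmp as scratch space ... */
+
+	mem_heap_free_heap_top(heap, old_top);	/* frees tmp and anything
+						allocated after old_top */
+}
+#endif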
+
+/*********************************************************************
+Empties a memory heap. The first memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_empty(
+/*===========*/
+ mem_heap_t* heap) /* in: heap to empty */
+{
+ mem_heap_free_heap_top(heap, (byte*)heap + mem_block_get_start(heap));
+
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+}
+
+/*********************************************************************
+Returns a pointer to the topmost element in a memory heap. The size of the
+element must be given. */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+/*=============*/
+ /* out: pointer to the topmost element */
+ mem_heap_t* heap, /* in: memory heap */
+ ulint n) /* in: size of the topmost element */
+{
+ mem_block_t* block;
+ void* buf;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*)block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n);
+
+ #ifdef UNIV_MEM_DEBUG
+ ut_ad(mem_block_get_start(block) <=(ulint)((byte*)buf - (byte*)block));
+
+ /* In the debug version, advance buf to point at the storage which
+ was given to the caller in the allocation*/
+
+ buf = (byte*)buf + MEM_FIELD_HEADER_SIZE;
+
+ /* Check that the field lengths agree */
+ ut_ad(n == (ulint)mem_field_header_get_len(buf));
+ #endif
+
+ return(buf);
+}
+
+/*********************************************************************
+Frees the topmost element in a memory heap. The size of the element must be
+given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /* in: memory heap */
+ ulint n) /* in: size of the topmost element */
+{
+ mem_block_t* block;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ /* Subtract the free field of block */
+ mem_block_set_free(block, mem_block_get_free(block)
+ - MEM_SPACE_NEEDED(n));
+ #ifdef UNIV_MEM_DEBUG
+
+ ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+
+ /* In the debug version check the consistency, and erase field */
+ mem_field_erase((byte*)block + mem_block_get_free(block), n);
+ #endif
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block) ==
+ mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ }
+}
+
+/*********************************************************************
+NOTE: Use the corresponding macros instead of this function. Creates a
+memory heap which allocates memory from dynamic space. For debugging
+purposes, takes also the file name and line as argument in the debug
+version. */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+/*=================*/
+ /* out, own: memory heap */
+ ulint n, /* in: desired start block size,
+ this means that a single user buffer
+ of size n will fit in the block,
+ 0 creates a default size block;
+ if init_block is not NULL, n tells
+ its size in bytes */
+ void* init_block, /* in: if very fast creation is
+ wanted, the caller can reserve some
+ memory from its stack, for example,
+				and pass it as the initial block
+ to the heap: then no OS call of malloc
+ is needed at the creation. CAUTION:
+ the caller must make sure the initial
+ block is not unintentionally erased
+ (if allocated in the stack), before
+ the memory heap is explicitly freed. */
+ ulint type /* in: MEM_HEAP_DYNAMIC, or MEM_HEAP_BUFFER
+ possibly ORed to MEM_HEAP_BTR_SEARCH */
+ #ifdef UNIV_MEM_DEBUG
+ ,char* file_name, /* in: file name where created */
+ ulint line /* in: line where created */
+ #endif
+ )
+{
+ mem_block_t* block;
+
+ if (n > 0) {
+ block = mem_heap_create_block(NULL, n, init_block, type);
+ } else {
+ block = mem_heap_create_block(NULL, MEM_BLOCK_START_SIZE,
+ init_block, type);
+ }
+
+ ut_ad(block);
+
+ UT_LIST_INIT(block->base);
+
+ /* Add the created block itself as the first block in the list */
+ UT_LIST_ADD_FIRST(list, block->base, block);
+
+ #ifdef UNIV_MEM_DEBUG
+
+ if (block == NULL) {
+
+ return(block);
+ }
+
+ mem_hash_insert(block, file_name, line);
+
+ #endif
+
+ return(block);
+}
+
+/*********************************************************************
+NOTE: Use the corresponding macro instead of this function. Frees the space
+occupied by a memory heap. In the debug version erases the heap memory
+blocks. */
+UNIV_INLINE
+void
+mem_heap_free_func(
+/*===============*/
+ mem_heap_t* heap /* in, own: heap to be freed */
+ #ifdef UNIV_MEM_DEBUG
+ ,char* file_name, /* in: file name where freed */
+ ulint line /* in: line where freed */
+ #endif
+ )
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ #ifdef UNIV_MEM_DEBUG
+
+ /* In the debug version remove the heap from the hash table of heaps
+ and check its consistency */
+
+ mem_hash_remove(heap, file_name, line);
+
+ #endif
+
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+
+ while (block != NULL) {
+		/* Store the prev_block pointer before freeing the current
+		block (the block contents are erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+}
+
+/*******************************************************************
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. Is like malloc of C. The buffer must be freed
+with mem_free. */
+UNIV_INLINE
+void*
+mem_alloc_func(
+/*===========*/
+ /* out, own: free storage, NULL if did not
+ succeed */
+ ulint n /* in: desired number of bytes */
+ #ifdef UNIV_MEM_DEBUG
+ ,char* file_name, /* in: file name where created */
+ ulint line /* in: line where created */
+ #endif
+ )
+{
+ #ifndef UNIV_MEM_DEBUG
+
+ return(mem_area_alloc(n, mem_comm_pool));
+
+ #else
+
+ mem_heap_t* heap;
+ void* buf;
+
+ heap = mem_heap_create_func(n, NULL, MEM_HEAP_DYNAMIC, file_name,
+ line);
+ if (heap == NULL) {
+
+ return(NULL);
+ }
+
+ /* Note that as we created the first block in the heap big enough
+ for the buffer requested by the caller, the buffer will be in the
+ first block and thus we can calculate the pointer to the heap from
+ the pointer to the buffer when we free the memory buffer. */
+
+ buf = mem_heap_alloc(heap, n);
+
+ ut_ad((byte*)heap == (byte*)buf - MEM_BLOCK_HEADER_SIZE
+ - MEM_FIELD_HEADER_SIZE);
+ return(buf);
+
+ #endif
+}
+
+/*******************************************************************
+NOTE: Use the corresponding macro instead of this function. Frees a single
+buffer of storage from the dynamic memory of the C compiler. Similar to the
+free of C. */
+UNIV_INLINE
+void
+mem_free_func(
+/*==========*/
+ void* ptr /* in, own: buffer to be freed */
+ #ifdef UNIV_MEM_DEBUG
+ ,char* file_name, /* in: file name where created */
+ ulint line /* in: line where created */
+ #endif
+ )
+{
+ #ifndef UNIV_MEM_DEBUG
+
+ mem_area_free(ptr, mem_comm_pool);
+
+ #else
+
+ mem_heap_t* heap;
+
+ heap = (mem_heap_t*)((byte*)ptr - MEM_BLOCK_HEADER_SIZE
+ - MEM_FIELD_HEADER_SIZE);
+ mem_heap_free_func(heap, file_name, line);
+
+ #endif
+}
+
+/*********************************************************************
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap) /* in: heap */
+{
+ mem_block_t* block;
+ ulint size = 0;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = heap;
+
+ while (block != NULL) {
+
+ size += mem_block_get_len(block);
+ block = UT_LIST_GET_NEXT(list, block);
+ }
+
+ if (heap->free_block) {
+ size += UNIV_PAGE_SIZE;
+ }
+
+ return(size);
+}
+
+/*******************************************************************
+Implements realloc. */
+UNIV_INLINE
+void*
+mem_realloc(
+/*========*/
+ /* out, own: free storage, NULL if did not succeed */
+ void* buf, /* in: pointer to an old buffer */
+ ulint n) /* in: desired number of bytes */
+{
+ mem_free(buf);
+
+ return(mem_alloc(n));
+}
diff --git a/innobase/include/mem0pool.h b/innobase/include/mem0pool.h
new file mode 100644
index 00000000000..b6906894c53
--- /dev/null
+++ b/innobase/include/mem0pool.h
@@ -0,0 +1,83 @@
+/******************************************************
+The lowest-level memory management
+
+(c) 1994, 1995 Innobase Oy
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0pool_h
+#define mem0pool_h
+
+#include "univ.i"
+#include "os0file.h"
+
+typedef struct mem_area_struct mem_area_t;
+typedef struct mem_pool_struct mem_pool_t;
+
+/* The common memory pool */
+extern mem_pool_t* mem_comm_pool;
+
+/* Each memory area takes this many extra bytes for control information */
+#define MEM_AREA_EXTRA_SIZE UNIV_MEM_ALIGNMENT
+
+/************************************************************************
+Creates a memory pool. */
+
+mem_pool_t*
+mem_pool_create(
+/*============*/
+ /* out: memory pool */
+ ulint size); /* in: pool size in bytes */
+/************************************************************************
+Allocates memory from a pool. NOTE: This low-level function should only be
+used in mem0mem.*! */
+
+void*
+mem_area_alloc(
+/*===========*/
+ /* out, own: allocated memory buffer */
+ ulint size, /* in: allocated size in bytes; for optimum
+ space usage, the size should be a power of 2
+ minus MEM_AREA_EXTRA_SIZE */
+ mem_pool_t* pool); /* in: memory pool */
+/************************************************************************
+Frees memory to a pool. */
+
+void
+mem_area_free(
+/*==========*/
+ void* ptr, /* in, own: pointer to allocated memory
+ buffer */
+ mem_pool_t* pool); /* in: memory pool */
+/************************************************************************
+Returns the amount of reserved memory. */
+
+ulint
+mem_pool_get_reserved(
+/*==================*/
+				/* out: reserved memory in bytes */
+ mem_pool_t* pool); /* in: memory pool */
+/************************************************************************
+Validates a memory pool. */
+
+ibool
+mem_pool_validate(
+/*==============*/
+ /* out: TRUE if ok */
+ mem_pool_t* pool); /* in: memory pool */
+/************************************************************************
+Prints info of a memory pool. */
+
+void
+mem_pool_print_info(
+/*================*/
+ FILE* outfile,/* in: output file to write to */
+ mem_pool_t* pool); /* in: memory pool */
+
+
+#ifndef UNIV_NONINL
+#include "mem0pool.ic"
+#endif
+
+#endif
diff --git a/innobase/include/mem0pool.ic b/innobase/include/mem0pool.ic
new file mode 100644
index 00000000000..4e8c08733ed
--- /dev/null
+++ b/innobase/include/mem0pool.ic
@@ -0,0 +1,7 @@
+/************************************************************************
+The lowest-level memory management
+
+(c) 1994, 1995 Innobase Oy
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
diff --git a/innobase/include/mtr0log.h b/innobase/include/mtr0log.h
new file mode 100644
index 00000000000..acbf87df447
--- /dev/null
+++ b/innobase/include/mtr0log.h
@@ -0,0 +1,178 @@
+/******************************************************
+Mini-transaction logging routines
+
+(c) 1995 Innobase Oy
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0log_h
+#define mtr0log_h
+
+#include "univ.i"
+#include "mtr0mtr.h"
+
+/************************************************************
+Writes 1 - 4 bytes to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+
+void
+mlog_write_ulint(
+/*=============*/
+ byte* ptr, /* in: pointer where to write */
+ ulint val, /* in: value to write */
+ byte type, /* in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************
+Writes 8 bytes to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+
+void
+mlog_write_dulint(
+/*==============*/
+ byte* ptr, /* in: pointer where to write */
+ dulint val, /* in: value to write */
+ byte type, /* in: MLOG_8BYTES */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************
+Writes a string to a file page buffered in the buffer pool. Writes the
+corresponding log record to the mini-transaction log. */
+
+void
+mlog_write_string(
+/*==============*/
+ byte* ptr, /* in: pointer where to write */
+ byte* str, /* in: string to write */
+ ulint len, /* in: string length */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************
+Writes initial part of a log record consisting of one-byte item
+type and four-byte space and page numbers. */
+
+void
+mlog_write_initial_log_record(
+/*==========================*/
+ byte* ptr, /* in: pointer to (inside) a buffer frame
+ holding the file page where modification
+ is made */
+ byte type, /* in: log item type: MLOG_1BYTE, ... */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************
+Catenates 1 - 4 bytes to the mtr log. */
+UNIV_INLINE
+void
+mlog_catenate_ulint(
+/*================*/
+ mtr_t* mtr, /* in: mtr */
+ ulint val, /* in: value to write */
+ ulint type); /* in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+/************************************************************
+Catenates n bytes to the mtr log. */
+
+void
+mlog_catenate_string(
+/*=================*/
+ mtr_t* mtr, /* in: mtr */
+ byte* str, /* in: string to write */
+ ulint len); /* in: string length */
+/************************************************************
+Catenates a compressed ulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ulint_compressed(
+/*===========================*/
+ mtr_t* mtr, /* in: mtr */
+ ulint val); /* in: value to write */
+/************************************************************
+Catenates a compressed dulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_dulint_compressed(
+/*============================*/
+ mtr_t* mtr, /* in: mtr */
+ dulint val); /* in: value to write */
+/************************************************************
+Opens a buffer to mlog. It must be closed with mlog_close. */
+UNIV_INLINE
+byte*
+mlog_open(
+/*======*/
+ /* out: buffer, NULL if log mode MTR_LOG_NONE */
+ mtr_t* mtr, /* in: mtr */
+ ulint size); /* in: buffer size in bytes */
+/************************************************************
+Closes a buffer opened to mlog. */
+UNIV_INLINE
+void
+mlog_close(
+/*=======*/
+ mtr_t* mtr, /* in: mtr */
+ byte* ptr); /* in: buffer space from ptr up was not used */
+/************************************************************
+Writes the initial part of a log record. */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_fast(
+/*===============================*/
+ /* out: new value of log_ptr */
+ byte* ptr, /* in: pointer to (inside) a buffer frame holding the
+ file page where modification is made */
+ byte type, /* in: log item type: MLOG_1BYTE, ... */
+ byte* log_ptr,/* in: pointer to mtr log which has been opened */
+ mtr_t* mtr); /* in: mtr */
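+/************************************************************
+Illustrative sketch, not part of the original header: how a caller can
+build a log record by hand with the fast routine above. The buffer size
+passed to mlog_open (30 bytes) is only an assumed upper bound for the
+initial record plus a small record body. */
+#if 0
+static void
+mlog_fast_write_example(byte* ptr, byte type, mtr_t* mtr)
+{
+	byte*	log_ptr;
+
+	log_ptr = mlog_open(mtr, 30);
+
+	if (log_ptr == NULL) {
+		/* logging mode is MTR_LOG_NONE */
+
+		return;
+	}
+
+	log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr,
+									mtr);
+	/* ... append the record body, specific to each record type ... */
+
+	mlog_close(mtr, log_ptr);
+}
+#endif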
+/****************************************************************
+Writes the contents of a mini-transaction log, if any, to the database log. */
+
+dulint
+mlog_write(
+/*=======*/
+ dyn_array_t* mlog, /* in: mlog */
+ ibool* modifications); /* out: TRUE if there were
+ log items to write */
+/************************************************************
+Parses an initial log record written by mlog_write_initial_log_record. */
+
+byte*
+mlog_parse_initial_log_record(
+/*==========================*/
+ /* out: parsed record end, NULL if not a complete
+ record */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ byte* type, /* out: log record type: MLOG_1BYTE, ... */
+ ulint* space, /* out: space id */
+ ulint* page_no);/* out: page number */
+/************************************************************
+Parses a log record written by mlog_write_ulint or mlog_write_dulint. */
+
+byte*
+mlog_parse_nbytes(
+/*==============*/
+ /* out: parsed record end, NULL if not a complete
+ record */
+ ulint type, /* in: log record type: MLOG_1BYTE, ... */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ byte* page); /* in: page where to apply the log record, or NULL */
+/************************************************************
+Parses a log record written by mlog_write_string. */
+
+byte*
+mlog_parse_string(
+/*==============*/
+ /* out: parsed record end, NULL if not a complete
+ record */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ byte* page); /* in: page where to apply the log record, or NULL */
+
+
+/* Insert, update, and maybe other functions may use this value to define an
+extra mlog buffer size for variable size data */
+#define MLOG_BUF_MARGIN 256
+
+#ifndef UNIV_NONINL
+#include "mtr0log.ic"
+#endif
+
+#endif
diff --git a/innobase/include/mtr0log.ic b/innobase/include/mtr0log.ic
new file mode 100644
index 00000000000..c2150660794
--- /dev/null
+++ b/innobase/include/mtr0log.ic
@@ -0,0 +1,187 @@
+/******************************************************
+Mini-transaction logging routines
+
+(c) 1995 Innobase Oy
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "ut0lst.h"
+#include "buf0buf.h"
+
+/************************************************************
+Opens a buffer to mlog. It must be closed with mlog_close. */
+UNIV_INLINE
+byte*
+mlog_open(
+/*======*/
+ /* out: buffer, NULL if log mode MTR_LOG_NONE */
+ mtr_t* mtr, /* in: mtr */
+ ulint size) /* in: buffer size in bytes */
+{
+ dyn_array_t* mlog;
+
+ mtr->modifications = TRUE;
+
+ if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+
+ return(NULL);
+ }
+
+ mlog = &(mtr->log);
+
+ return(dyn_array_open(mlog, size));
+}
+
+/************************************************************
+Closes a buffer opened to mlog. */
+UNIV_INLINE
+void
+mlog_close(
+/*=======*/
+ mtr_t* mtr, /* in: mtr */
+ byte* ptr) /* in: buffer space from ptr up was not used */
+{
+ dyn_array_t* mlog;
+
+ ut_ad(mtr_get_log_mode(mtr) != MTR_LOG_NONE);
+
+ mlog = &(mtr->log);
+
+ dyn_array_close(mlog, ptr);
+}
+
+/************************************************************
+Catenates 1 - 4 bytes to the mtr log. The value is not compressed. */
+UNIV_INLINE
+void
+mlog_catenate_ulint(
+/*================*/
+ mtr_t* mtr, /* in: mtr */
+ ulint val, /* in: value to write */
+ ulint type) /* in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+{
+ dyn_array_t* mlog;
+ byte* ptr;
+
+ if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+
+ return;
+ }
+
+ mlog = &(mtr->log);
+
+ ut_ad(MLOG_1BYTE == 1);
+ ut_ad(MLOG_2BYTES == 2);
+ ut_ad(MLOG_4BYTES == 4);
+
+ ptr = dyn_array_push(mlog, type);
+
+ if (type == MLOG_4BYTES) {
+ mach_write_to_4(ptr, val);
+ } else if (type == MLOG_2BYTES) {
+ mach_write_to_2(ptr, val);
+ } else {
+ ut_ad(type == MLOG_1BYTE);
+ mach_write_to_1(ptr, val);
+ }
+}
+
+/************************************************************
+Catenates a compressed ulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ulint_compressed(
+/*===========================*/
+ mtr_t* mtr, /* in: mtr */
+ ulint val) /* in: value to write */
+{
+ byte* log_ptr;
+
+ log_ptr = mlog_open(mtr, 10);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr += mach_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+}
+
+/************************************************************
+Catenates a compressed dulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_dulint_compressed(
+/*============================*/
+ mtr_t* mtr, /* in: mtr */
+ dulint val) /* in: value to write */
+{
+ byte* log_ptr;
+
+ log_ptr = mlog_open(mtr, 15);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr += mach_dulint_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+}
+
+/************************************************************
+Writes the initial part of a log record. */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_fast(
+/*===============================*/
+ /* out: new value of log_ptr */
+ byte* ptr, /* in: pointer to (inside) a buffer frame holding the
+ file page where modification is made */
+ byte type, /* in: log item type: MLOG_1BYTE, ... */
+ byte* log_ptr,/* in: pointer to mtr log which has been opened */
+ mtr_t* mtr) /* in: mtr */
+{
+ buf_block_t* block;
+ ulint space;
+ ulint offset;
+
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(ptr),
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(type <= MLOG_BIGGEST_TYPE);
+ ut_ad(ptr && log_ptr);
+
+ block = buf_block_align(ptr);
+
+ space = buf_block_get_space(block);
+ offset = buf_block_get_page_no(block);
+
+ mach_write_to_1(log_ptr, type);
+ log_ptr++;
+ log_ptr += mach_write_compressed(log_ptr, space);
+ log_ptr += mach_write_compressed(log_ptr, offset);
+
+ mtr->n_log_recs++;
+
+#ifdef UNIV_LOG_DEBUG
+/* printf("Adding to mtr log record type %lu space %lu page no %lu\n",
+ type, space, offset); */
+#endif
+
+#ifdef UNIV_DEBUG
+ /* We now assume that all x-latched pages have been modified! */
+
+ if (!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY)) {
+
+ mtr_memo_push(mtr, block, MTR_MEMO_MODIFY);
+ }
+#endif
+ return(log_ptr);
+}
diff --git a/innobase/include/mtr0mtr.h b/innobase/include/mtr0mtr.h
new file mode 100644
index 00000000000..9f9401cd1a5
--- /dev/null
+++ b/innobase/include/mtr0mtr.h
@@ -0,0 +1,343 @@
+/******************************************************
+Mini-transaction buffer
+
+(c) 1995 Innobase Oy
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0mtr_h
+#define mtr0mtr_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#include "dyn0dyn.h"
+#include "buf0types.h"
+#include "sync0rw.h"
+#include "ut0byte.h"
+#include "mtr0types.h"
+#include "page0types.h"
+
+/* Logging modes for a mini-transaction */
+#define MTR_LOG_ALL 21 /* default mode: log all operations
+ modifying disk-based data */
+#define MTR_LOG_NONE 22 /* log no operations */
+/*#define MTR_LOG_SPACE 23 */ /* log only operations modifying
+ file space page allocation data
+ (operations in fsp0fsp.* ) */
+#define MTR_LOG_SHORT_INSERTS 24 /* inserts are logged in a shorter
+ form */
+
+/* Types for the mlock objects to store in the mtr memo; NOTE that the
+first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+#define MTR_MEMO_PAGE_S_FIX RW_S_LATCH
+#define MTR_MEMO_PAGE_X_FIX RW_X_LATCH
+#define MTR_MEMO_BUF_FIX RW_NO_LATCH
+#define MTR_MEMO_MODIFY 54
+#define MTR_MEMO_S_LOCK 55
+#define MTR_MEMO_X_LOCK 56
+
+/* Log item types: we have made them to be of the type 'byte'
+for the compiler to warn if val and type parameters are switched
+in a call to mlog_write_ulint. NOTE! For 1 - 8 bytes, the
+flag value must give the length also! */
+#define MLOG_SINGLE_REC_FLAG 128 /* if the mtr contains only
+ one log record for one page,
+ i.e., write_initial_log_record
+ has been called only once,
+ this flag is ORed to the type
+ of that first log record */
+#define MLOG_1BYTE ((byte)1) /* one byte is written */
+#define MLOG_2BYTES ((byte)2) /* 2 bytes ... */
+#define MLOG_4BYTES ((byte)4) /* 4 bytes ... */
+#define MLOG_8BYTES ((byte)8) /* 8 bytes ... */
+#define MLOG_REC_INSERT ((byte)9) /* record insert */
+#define MLOG_REC_CLUST_DELETE_MARK ((byte)10) /* mark clustered index record
+ deleted */
+#define MLOG_REC_SEC_DELETE_MARK ((byte)11) /* mark secondary index record
+ deleted */
+#define MLOG_REC_UPDATE_IN_PLACE ((byte)13) /* update of a record,
+ preserves record field sizes */
+#define MLOG_REC_DELETE ((byte)14) /* delete a record from a
+ page */
+#define MLOG_LIST_END_DELETE ((byte)15) /* delete record list end on
+ index page */
+#define MLOG_LIST_START_DELETE ((byte)16) /* delete record list start on
+ index page */
+#define MLOG_LIST_END_COPY_CREATED ((byte)17) /* copy record list end to a
+ new created index page */
+#define MLOG_PAGE_REORGANIZE ((byte)18) /* reorganize an index page */
+#define MLOG_PAGE_CREATE ((byte)19) /* create an index page */
+#define MLOG_UNDO_INSERT ((byte)20) /* insert entry in an undo
+ log */
+#define MLOG_UNDO_ERASE_END ((byte)21) /* erase an undo log page end */
+#define MLOG_UNDO_INIT ((byte)22) /* initialize a page in an
+ undo log */
+#define MLOG_UNDO_HDR_DISCARD ((byte)23) /* discard an update undo log
+ header */
+#define MLOG_UNDO_HDR_REUSE ((byte)24) /* reuse an insert undo log
+ header */
+#define MLOG_UNDO_HDR_CREATE ((byte)25) /* create an undo log header */
+#define MLOG_REC_MIN_MARK ((byte)26) /* mark an index record as the
+ predefined minimum record */
+#define MLOG_IBUF_BITMAP_INIT ((byte)27) /* initialize an ibuf bitmap
+ page */
+#define MLOG_FULL_PAGE ((byte)28) /* full contents of a page */
+#define MLOG_INIT_FILE_PAGE ((byte)29) /* this means that a file page
+ is taken into use and the prior
+ contents of the page should be
+ ignored: in recovery we must
+ not trust the lsn values stored
+ to the file page */
+#define MLOG_WRITE_STRING ((byte)30) /* write a string to a page */
+#define MLOG_MULTI_REC_END ((byte)31) /* if a single mtr writes
+ log records for several pages,
+ this log record ends the
+ sequence of these records */
+#define MLOG_DUMMY_RECORD ((byte)32) /* dummy log record used to
+ pad a log block full */
+#define MLOG_BIGGEST_TYPE ((byte)32) /* biggest value (used in
+ asserts) */
+
+/*******************************************************************
+Starts a mini-transaction and creates a mini-transaction handle
+and buffer in the memory buffer given by the caller. */
+UNIV_INLINE
+mtr_t*
+mtr_start(
+/*======*/
+ /* out: mtr buffer which also acts as
+ the mtr handle */
+ mtr_t* mtr); /* in: memory buffer for the mtr buffer */
+/*******************************************************************
+Starts a mini-transaction and creates a mini-transaction handle
+and buffer in the memory buffer given by the caller. */
+
+mtr_t*
+mtr_start_noninline(
+/*================*/
+ /* out: mtr buffer which also acts as
+ the mtr handle */
+ mtr_t* mtr); /* in: memory buffer for the mtr buffer */
+/*******************************************************************
+Commits a mini-transaction. */
+
+void
+mtr_commit(
+/*=======*/
+ mtr_t* mtr); /* in: mini-transaction */
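+/*******************************************************************
+Illustrative sketch, not part of the original header: the basic life
+cycle of a mini-transaction. The mtr struct normally lives on the
+caller's stack. */
+#if 0
+static void
+mtr_usage_example(void)
+{
+	mtr_t	mtr;
+
+	mtr_start(&mtr);
+
+	/* ... latch buffer pool pages and modify them, writing the
+	corresponding log records with the mlog_* routines ... */
+
+	mtr_commit(&mtr);	/* writes the log and releases the latches
+				registered in the memo */
+}
+#endif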
+/****************************************************************
+Writes to the database log the full contents of the pages that this mtr is
+the first to modify in the buffer pool. This function is called when the
+database is in the online backup state. */
+
+void
+mtr_log_write_backup_entries(
+/*=========================*/
+ mtr_t* mtr, /* in: mini-transaction */
+ dulint backup_lsn); /* in: online backup lsn */
+/**************************************************************
+Sets and returns a savepoint in mtr. */
+UNIV_INLINE
+ulint
+mtr_set_savepoint(
+/*==============*/
+ /* out: savepoint */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************
+Releases the latches stored in an mtr memo down to a savepoint.
+NOTE! The mtr must not have made changes to buffer pages after the
+savepoint, as these can be handled only by mtr_commit. */
+
+void
+mtr_rollback_to_savepoint(
+/*======================*/
+ mtr_t* mtr, /* in: mtr */
+ ulint savepoint); /* in: savepoint */
+/**************************************************************
+Releases the (index tree) s-latch stored in an mtr memo after a
+savepoint. */
+UNIV_INLINE
+void
+mtr_release_s_latch_at_savepoint(
+/*=============================*/
+ mtr_t* mtr, /* in: mtr */
+ ulint savepoint, /* in: savepoint */
+ rw_lock_t* lock); /* in: latch to release */
+/*******************************************************************
+Gets the logging mode of a mini-transaction. */
+UNIV_INLINE
+ulint
+mtr_get_log_mode(
+/*=============*/
+ /* out: logging mode: MTR_LOG_NONE, ... */
+ mtr_t* mtr); /* in: mtr */
+/*******************************************************************
+Changes the logging mode of a mini-transaction. */
+UNIV_INLINE
+ulint
+mtr_set_log_mode(
+/*=============*/
+ /* out: old mode */
+ mtr_t* mtr, /* in: mtr */
+ ulint mode); /* in: logging mode: MTR_LOG_NONE, ... */
+/************************************************************
+Reads 1 - 4 bytes from a file page buffered in the buffer pool. */
+
+ulint
+mtr_read_ulint(
+/*===========*/
+ /* out: value read */
+ byte* ptr, /* in: pointer from where to read */
+ ulint type, /* in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/************************************************************
+Reads 8 bytes from a file page buffered in the buffer pool. */
+
+dulint
+mtr_read_dulint(
+/*===========*/
+ /* out: value read */
+ byte* ptr, /* in: pointer from where to read */
+ ulint type, /* in: MLOG_8BYTES */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/*************************************************************************
+This macro locks an rw-lock in s-mode. */
+#ifdef UNIV_SYNC_DEBUG
+#define mtr_s_lock(B, MTR) mtr_s_lock_func((B), __FILE__, __LINE__,\
+ (MTR))
+#else
+#define mtr_s_lock(B, MTR) mtr_s_lock_func((B), (MTR))
+#endif
+/*************************************************************************
+This macro locks an rw-lock in x-mode. */
+#ifdef UNIV_SYNC_DEBUG
+#define mtr_x_lock(B, MTR) mtr_x_lock_func((B), __FILE__, __LINE__,\
+ (MTR))
+#else
+#define mtr_x_lock(B, MTR) mtr_x_lock_func((B), (MTR))
+#endif
+/*************************************************************************
+NOTE! Use the macro above!
+Locks a lock in s-mode. */
+UNIV_INLINE
+void
+mtr_s_lock_func(
+/*============*/
+ rw_lock_t* lock, /* in: rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+ char* file, /* in: file name */
+ ulint line, /* in: line number */
+#endif
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************************
+NOTE! Use the macro above!
+Locks a lock in x-mode. */
+UNIV_INLINE
+void
+mtr_x_lock_func(
+/*============*/
+ rw_lock_t* lock, /* in: rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+ char* file, /* in: file name */
+ ulint line, /* in: line number */
+#endif
+ mtr_t* mtr); /* in: mtr */
+
+/*******************************************************
+Releases an object in the memo stack. */
+
+void
+mtr_memo_release(
+/*=============*/
+ mtr_t* mtr, /* in: mtr */
+ void* object, /* in: object */
+ ulint type); /* in: object type: MTR_MEMO_S_LOCK, ... */
+/****************************************************************
+Parses a log record which contains the full contents of a page. */
+
+byte*
+mtr_log_parse_full_page(
+/*====================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page); /* in: page or NULL */
+/**************************************************************
+Checks if memo contains the given item. */
+UNIV_INLINE
+ibool
+mtr_memo_contains(
+/*==============*/
+ /* out: TRUE if contains */
+ mtr_t* mtr, /* in: mtr */
+ void* object, /* in: object to search */
+ ulint type); /* in: type of object */
+/*************************************************************
+Prints info of an mtr handle. */
+
+void
+mtr_print(
+/*======*/
+ mtr_t* mtr); /* in: mtr */
+/*######################################################################*/
+
+#define MTR_BUF_MEMO_SIZE 200 /* number of slots in memo */
+
+/*******************************************************************
+Returns the log object of a mini-transaction buffer. */
+UNIV_INLINE
+dyn_array_t*
+mtr_get_log(
+/*========*/
+ /* out: log */
+ mtr_t* mtr); /* in: mini-transaction */
+/*******************************************************
+Pushes an object to an mtr memo stack. */
+UNIV_INLINE
+void
+mtr_memo_push(
+/*==========*/
+ mtr_t* mtr, /* in: mtr */
+ void* object, /* in: object */
+ ulint type); /* in: object type: MTR_MEMO_S_LOCK, ... */
+
+
+/* Type definition of a mini-transaction memo stack slot. */
+typedef struct mtr_memo_slot_struct mtr_memo_slot_t;
+struct mtr_memo_slot_struct{
+ ulint type; /* type of the stored object (MTR_MEMO_S_LOCK, ...) */
+ void* object; /* pointer to the object */
+};
+
+/* Mini-transaction handle and buffer */
+struct mtr_struct{
+ ulint state; /* MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */
+ dyn_array_t memo; /* memo stack for locks etc. */
+ dyn_array_t log; /* mini-transaction log */
+ ibool modifications;
+ /* TRUE if the mtr made modifications to
+ buffer pool pages */
+ ulint n_log_recs;
+ /* count of how many page initial log records
+ have been written to the mtr log */
+ ulint log_mode; /* specifies which operations should be
+ logged; default value MTR_LOG_ALL */
+ dulint start_lsn;/* start lsn of the possible log entry for
+ this mtr */
+ dulint end_lsn;/* end lsn of the possible log entry for
+ this mtr */
+ ulint magic_n;
+};
+
+#define MTR_MAGIC_N 54551
+
+#define MTR_ACTIVE 12231
+#define MTR_COMMITTING 56456
+#define MTR_COMMITTED 34676
+
+#ifndef UNIV_NONINL
+#include "mtr0mtr.ic"
+#endif
+
+#endif
diff --git a/innobase/include/mtr0mtr.ic b/innobase/include/mtr0mtr.ic
new file mode 100644
index 00000000000..5718d872bcb
--- /dev/null
+++ b/innobase/include/mtr0mtr.ic
@@ -0,0 +1,261 @@
+/******************************************************
+Mini-transaction buffer
+
+(c) 1995 Innobase Oy
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#include "sync0rw.h"
+#include "mach0data.h"
+
+/*******************************************************************
+Starts a mini-transaction and creates a mini-transaction handle
+and a buffer in the memory buffer given by the caller. */
+UNIV_INLINE
+mtr_t*
+mtr_start(
+/*======*/
+ /* out: mtr buffer which also acts as
+ the mtr handle */
+ mtr_t* mtr) /* in: memory buffer for the mtr buffer */
+{
+ dyn_array_create(&(mtr->memo));
+ dyn_array_create(&(mtr->log));
+
+ mtr->log_mode = MTR_LOG_ALL;
+ mtr->modifications = FALSE;
+ mtr->n_log_recs = 0;
+
+#ifdef UNIV_DEBUG
+ mtr->state = MTR_ACTIVE;
+ mtr->magic_n = MTR_MAGIC_N;
+#endif
+ return(mtr);
+}
+
+/*******************************************************
+Pushes an object to an mtr memo stack. */
+UNIV_INLINE
+void
+mtr_memo_push(
+/*==========*/
+ mtr_t* mtr, /* in: mtr */
+ void* object, /* in: object */
+ ulint type) /* in: object type: MTR_MEMO_S_LOCK, ... */
+{
+ dyn_array_t* memo;
+ mtr_memo_slot_t* slot;
+
+ ut_ad(object);
+ ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
+ ut_ad(type <= MTR_MEMO_X_LOCK);
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ memo = &(mtr->memo);
+
+ slot = dyn_array_push(memo, sizeof(mtr_memo_slot_t));
+
+ slot->object = object;
+ slot->type = type;
+}
+
+/**************************************************************
+Sets and returns a savepoint in mtr. */
+UNIV_INLINE
+ulint
+mtr_set_savepoint(
+/*==============*/
+ /* out: savepoint */
+ mtr_t* mtr) /* in: mtr */
+{
+ dyn_array_t* memo;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ memo = &(mtr->memo);
+
+ return(dyn_array_get_data_size(memo));
+}
+
+/**************************************************************
+Releases the (index tree) s-latch stored in an mtr memo after a
+savepoint. */
+UNIV_INLINE
+void
+mtr_release_s_latch_at_savepoint(
+/*=============================*/
+ mtr_t* mtr, /* in: mtr */
+ ulint savepoint, /* in: savepoint */
+ rw_lock_t* lock) /* in: latch to release */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ memo = &(mtr->memo);
+
+ ut_ad(dyn_array_get_data_size(memo) > savepoint);
+
+ slot = dyn_array_get_element(memo, savepoint);
+
+ ut_ad(slot->object == lock);
+ ut_ad(slot->type == MTR_MEMO_S_LOCK);
+
+ rw_lock_s_unlock(lock);
+
+ slot->object = NULL;
+}
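+
+/**************************************************************
+Illustrative sketch, not part of the original file: a savepoint taken
+just before an s-latch is acquired can later be used to release exactly
+that latch; the rw-lock here is only a placeholder. */
+#if 0
+static void
+mtr_savepoint_example(mtr_t* mtr, rw_lock_t* lock)
+{
+	ulint	savepoint;
+
+	savepoint = mtr_set_savepoint(mtr);
+
+	mtr_s_lock(lock, mtr);	/* the latch slot is pushed at the
+				savepoint offset */
+
+	/* ... use the protected structure ... */
+
+	mtr_release_s_latch_at_savepoint(mtr, savepoint, lock);
+}
+#endif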
+
+/**************************************************************
+Checks if memo contains the given item. */
+UNIV_INLINE
+ibool
+mtr_memo_contains(
+/*==============*/
+ /* out: TRUE if contains */
+ mtr_t* mtr, /* in: mtr */
+ void* object, /* in: object to search */
+ ulint type) /* in: type of object */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+ ulint offset;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ memo = &(mtr->memo);
+
+ offset = dyn_array_get_data_size(memo);
+
+ while (offset > 0) {
+ offset -= sizeof(mtr_memo_slot_t);
+
+ slot = dyn_array_get_element(memo, offset);
+
+ if ((object == slot->object) && (type == slot->type)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************
+Returns the log object of a mini-transaction buffer. */
+UNIV_INLINE
+dyn_array_t*
+mtr_get_log(
+/*========*/
+ /* out: log */
+ mtr_t* mtr) /* in: mini-transaction */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ return(&(mtr->log));
+}
+
+/*******************************************************************
+Gets the logging mode of a mini-transaction. */
+UNIV_INLINE
+ulint
+mtr_get_log_mode(
+/*=============*/
+ /* out: logging mode: MTR_LOG_NONE, ... */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->log_mode >= MTR_LOG_ALL);
+ ut_ad(mtr->log_mode <= MTR_LOG_SHORT_INSERTS);
+
+ return(mtr->log_mode);
+}
+
+/*******************************************************************
+Changes the logging mode of a mini-transaction. */
+UNIV_INLINE
+ulint
+mtr_set_log_mode(
+/*=============*/
+ /* out: old mode */
+ mtr_t* mtr, /* in: mtr */
+ ulint mode) /* in: logging mode: MTR_LOG_NONE, ... */
+{
+ ulint old_mode;
+
+ ut_ad(mtr);
+ ut_ad(mode >= MTR_LOG_ALL);
+ ut_ad(mode <= MTR_LOG_SHORT_INSERTS);
+
+ old_mode = mtr->log_mode;
+
+ if ((mode == MTR_LOG_SHORT_INSERTS) && (old_mode == MTR_LOG_NONE)) {
+ /* Do nothing */
+ } else {
+ mtr->log_mode = mode;
+ }
+
+ ut_ad(old_mode >= MTR_LOG_ALL);
+ ut_ad(old_mode <= MTR_LOG_SHORT_INSERTS);
+
+ return(old_mode);
+}
+
+/*************************************************************************
+Locks a lock in s-mode. */
+UNIV_INLINE
+void
+mtr_s_lock_func(
+/*============*/
+ rw_lock_t* lock, /* in: rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+ char* file, /* in: file name */
+ ulint line, /* in: line number */
+#endif
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(lock);
+
+ rw_lock_s_lock_func(lock
+ #ifdef UNIV_SYNC_DEBUG
+ ,0, file, line
+ #endif
+ );
+
+ mtr_memo_push(mtr, lock, MTR_MEMO_S_LOCK);
+}
+
+/*************************************************************************
+Locks a lock in x-mode. */
+UNIV_INLINE
+void
+mtr_x_lock_func(
+/*============*/
+ rw_lock_t* lock, /* in: rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+ char* file, /* in: file name */
+ ulint line, /* in: line number */
+#endif
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(lock);
+
+ rw_lock_x_lock_func(lock, 0
+ #ifdef UNIV_SYNC_DEBUG
+ , file, line
+ #endif
+ );
+
+ mtr_memo_push(mtr, lock, MTR_MEMO_X_LOCK);
+}
diff --git a/innobase/include/mtr0types.h b/innobase/include/mtr0types.h
new file mode 100644
index 00000000000..e3b6ec9a84f
--- /dev/null
+++ b/innobase/include/mtr0types.h
@@ -0,0 +1,14 @@
+/******************************************************
+Mini-transaction buffer global types
+
+(c) 1995 Innobase Oy
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0types_h
+#define mtr0types_h
+
+typedef struct mtr_struct mtr_t;
+
+#endif
diff --git a/innobase/include/odbc0odbc.h b/innobase/include/odbc0odbc.h
new file mode 100644
index 00000000000..7f842b54b27
--- /dev/null
+++ b/innobase/include/odbc0odbc.h
@@ -0,0 +1,20 @@
+/******************************************************
+Innobase ODBC client library additional header
+
+(c) 1998 Innobase Oy
+
+Created 2/22/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef odbc0odbc_h
+#define odbc0odbc_h
+
+#include "ib_odbc.h"
+
+/* Datagram size in communications */
+#define ODBC_DATAGRAM_SIZE 8192
+
+/* Communication address maximum length in bytes */
+#define ODBC_ADDRESS_SIZE COM_MAX_ADDR_LEN
+
+#endif
diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h
new file mode 100644
index 00000000000..5b90f24f12e
--- /dev/null
+++ b/innobase/include/os0file.h
@@ -0,0 +1,353 @@
+/******************************************************
+The interface to the operating system file io
+
+(c) 1995 Innobase Oy
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "univ.i"
+
+#ifdef __WIN__
+
+#include <windows.h>
+#if (defined(__NT__) || defined(__WIN2000__))
+
+#define WIN_ASYNC_IO
+
+#endif
+
+#define UNIV_NON_BUFFERED_IO
+
+#else
+
+#if defined(HAVE_AIO_H) && defined(HAVE_LIBRT)
+#define POSIX_ASYNC_IO
+#endif
+
+#endif
+
+#ifdef __WIN__
+typedef HANDLE os_file_t;
+#else
+typedef int os_file_t;
+#endif
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio we build below with threads */
+
+extern ibool os_aio_use_native_aio;
+
+#define OS_FILE_SECTOR_SIZE 512
+
+/* The next value should be smaller than or equal to the smallest sector size used
+on any disk. A log block is required to be a portion of disk which is written
+so that if the start and the end of a block get written to disk, then the
+whole block gets written. This should be true even in most cases of a crash:
+if this fails for a log block, then it is equivalent to a media failure in the
+log. */
+
+#define OS_FILE_LOG_BLOCK_SIZE 512
+
+/* Options for file_create */
+#define OS_FILE_OPEN 51
+#define OS_FILE_CREATE 52
+#define OS_FILE_OVERWRITE 53
+
+/* Options for file_create */
+#define OS_FILE_AIO 61
+#define OS_FILE_NORMAL 62
+
+/* Error codes from os_file_get_last_error */
+#define OS_FILE_NOT_FOUND 71
+#define OS_FILE_DISK_FULL 72
+#define OS_FILE_ALREADY_EXISTS 73
+#define OS_FILE_AIO_RESOURCES_RESERVED 74 /* wait for OS aio resources
+ to become available again */
+#define OS_FILE_ERROR_NOT_SPECIFIED 75
+
+/* Types for aio operations */
+#define OS_FILE_READ 10
+#define OS_FILE_WRITE 11
+
+#define OS_FILE_LOG 256 /* This can be ORed to type */
+
+#define OS_AIO_N_PENDING_IOS_PER_THREAD 32 /* Win NT does not allow more
+ than 64 */
+
+/* Modes for aio operations */
+#define OS_AIO_NORMAL 21 /* Normal asynchronous i/o not for ibuf
+ pages or ibuf bitmap pages */
+#define OS_AIO_IBUF 22 /* Asynchronous i/o for ibuf pages or ibuf
+ bitmap pages */
+#define OS_AIO_LOG 23 /* Asynchronous i/o for the log */
+#define OS_AIO_SYNC 24 /* Asynchronous i/o where the calling thread
+ will itself wait for the i/o to complete,
+ doing also the job of the i/o-handler thread;
+ can be used for any pages, ibuf or non-ibuf.
+ This is used to save CPU time, as we can do
+ with fewer thread switches. Plain synchronous
+ i/o is not as good, because it must serialize
+ the file seek and read or write, causing a
+ bottleneck for parallelism. */
+
+#define OS_AIO_SIMULATED_WAKE_LATER 512 /* This can be ORed to mode
+ in the call of os_aio(...),
+ if the caller wants to post several i/o
+ requests in a batch, and only after that
+ wake the i/o-handler thread; this has
+ effect only in simulated aio */
+
+/********************************************************************
+Opens an existing file or creates a new one. */
+
+os_file_t
+os_file_create(
+/*===========*/
+ /* out, own: handle to the file, not defined if error,
+			error number can be retrieved with os_file_get_last_error */
+ char* name, /* in: name of the file or path as a null-terminated
+ string */
+ ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened
+ (if does not exist, error), or OS_FILE_CREATE if a new
+ file is created (if exists, error), OS_FILE_OVERWRITE
+ if a new file is created or an old overwritten */
+ ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o
+ is desired, OS_FILE_NORMAL, if any normal file */
+ ibool* success);/* out: TRUE if succeed, FALSE if error */
+/***************************************************************************
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error. */
+
+ibool
+os_file_close(
+/*==========*/
+ /* out: TRUE if success */
+ os_file_t file); /* in, own: handle to a file */
+/***************************************************************************
+Gets a file size. */
+
+ibool
+os_file_get_size(
+/*=============*/
+ /* out: TRUE if success */
+ os_file_t file, /* in: handle to a file */
+ ulint* size, /* out: least significant 32 bits of file
+ size */
+ ulint* size_high);/* out: most significant 32 bits of size */
+/***************************************************************************
+Sets a file size. This function can be used to extend or truncate a file. */
+
+ibool
+os_file_set_size(
+/*=============*/
+ /* out: TRUE if success */
+ char* name, /* in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /* in: handle to a file */
+ ulint size, /* in: least significant 32 bits of file
+ size */
+ ulint size_high);/* in: most significant 32 bits of size */
+/***************************************************************************
+Flushes the write buffers of a given file to the disk. */
+
+ibool
+os_file_flush(
+/*==========*/
+ /* out: TRUE if success */
+	os_file_t	file);	/* in: handle to a file */
+/***************************************************************************
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned. */
+
+ulint
+os_file_get_last_error(void);
+/*========================*/
+ /* out: error number, or OS error number + 100 */
+/***********************************************************************
+Requests a synchronous read operation. */
+
+ibool
+os_file_read(
+/*=========*/
+ /* out: TRUE if request was
+ successful, FALSE if fail */
+ os_file_t file, /* in: handle to a file */
+ void* buf, /* in: buffer where to read */
+ ulint offset, /* in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high,/* in: most significant 32 bits of
+ offset */
+ ulint n); /* in: number of bytes to read */
+/***********************************************************************
+Requests a synchronous write operation. */
+
+ibool
+os_file_write(
+/*==========*/
+ /* out: TRUE if request was
+ successful, FALSE if fail */
+ char* name, /* in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /* in: handle to a file */
+ void* buf, /* in: buffer from which to write */
+ ulint offset, /* in: least significant 32 bits of file
+ offset where to write */
+ ulint offset_high,/* in: most significant 32 bits of
+ offset */
+ ulint n); /* in: number of bytes to write */
+/****************************************************************************
+Initializes the asynchronous io system. Creates separate aio array for
+non-ibuf read and write, a third aio array for the ibuf i/o, with just one
+segment, two aio arrays for log reads and writes with one segment, and a
+synchronous aio array of the specified size. The combined number of segments
+in the three first aio arrays is the parameter n_segments given to the
+function. The caller must create an i/o handler thread for each segment in
+the four first arrays, but not for the sync aio array. */
+
+void
+os_aio_init(
+/*========*/
+ ulint n, /* in: maximum number of pending aio operations
+ allowed; n must be divisible by n_segments */
+ ulint n_segments, /* in: combined number of segments in the four
+ first aio arrays; must be >= 4 */
+ ulint n_slots_sync); /* in: number of slots in the sync aio array */
+/***********************************************************************
+Requests an asynchronous i/o operation. */
+
+ibool
+os_aio(
+/*===*/
+ /* out: TRUE if request was queued
+ successfully, FALSE if fail */
+ ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /* in: OS_AIO_NORMAL, ..., possibly ORed
+ to OS_AIO_SIMULATED_WAKE_LATER: the
+ last flag advises this function not to wake
+ i/o-handler threads, but the caller will
+ do the waking explicitly later, in this
+ way the caller can post several requests in
+ a batch; NOTE that the batch must not be
+ so big that it exhausts the slots in aio
+ arrays! NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ char* name, /* in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /* in: handle to a file */
+ void* buf, /* in: buffer where to read or from which
+ to write */
+ ulint offset, /* in: least significant 32 bits of file
+ offset where to read or write */
+ ulint offset_high, /* in: most significant 32 bits of
+ offset */
+ ulint n, /* in: number of bytes to read or write */
+ void* message1,/* in: messages for the aio handler (these
+ can be used to identify a completed aio
+ operation); if mode is OS_AIO_SYNC, these
+ are ignored */
+ void* message2);
+/**************************************************************************
+Wakes up simulated aio i/o-handler threads if they have something to do. */
+
+void
+os_aio_simulated_wake_handler_threads(void);
+/*=======================================*/
+
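A minimal sketch (not part of this header) of the batching pattern described above: post several simulated aio requests with OS_AIO_SIMULATED_WAKE_LATER ORed into the mode, and wake the i/o-handler threads only once afterwards. The file handle, buffers, sizes and the helper name are illustrative assumptions; real callers would also pass non-NULL messages so that completed operations can be identified.

static void
os_aio_batch_sketch(os_file_t file, byte* buf1, byte* buf2)
{
	os_aio(OS_FILE_READ, OS_AIO_NORMAL | OS_AIO_SIMULATED_WAKE_LATER,
		(char*)"ib_sketch_file", file, buf1, 0, 0, 8192, NULL, NULL);

	os_aio(OS_FILE_READ, OS_AIO_NORMAL | OS_AIO_SIMULATED_WAKE_LATER,
		(char*)"ib_sketch_file", file, buf2, 8192, 0, 8192, NULL, NULL);

	/* wake the handler threads only after the whole batch is posted */
	os_aio_simulated_wake_handler_threads();
}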
+#ifdef WIN_ASYNC_IO
+/**************************************************************************
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing! */
+
+ibool
+os_aio_windows_handle(
+/*==================*/
+ /* out: TRUE if the aio operation succeeded */
+ ulint segment, /* in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads; if
+ this is ULINT_UNDEFINED, then it means that
+ sync aio is used, and this parameter is
+ ignored */
+ ulint pos, /* this parameter is used only in sync aio:
+ wait for the aio slot at this position */
+ void** message1, /* out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2);
+#endif
+#ifdef POSIX_ASYNC_IO
+/**************************************************************************
+This function is only used in Posix asynchronous i/o. Waits for an aio
+operation to complete. */
+
+ibool
+os_aio_posix_handle(
+/*================*/
+ /* out: TRUE if the aio operation succeeded */
+ ulint array_no, /* in: array number 0 - 3 */
+ void** message1, /* out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2);
+#endif
+/**************************************************************************
+Does simulated aio. This function should be called by an i/o-handler
+thread. */
+
+ibool
+os_aio_simulated_handle(
+/*====================*/
+ /* out: TRUE if the aio operation succeeded */
+ ulint segment, /* in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads */
+ void** message1, /* out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2);
+/**************************************************************************
+Validates the consistency of the aio system. */
+
+ibool
+os_aio_validate(void);
+/*=================*/
+ /* out: TRUE if ok */
+/**************************************************************************
+Prints info of the aio arrays. */
+
+void
+os_aio_print(void);
+/*==============*/
+/**************************************************************************
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+
+ibool
+os_aio_all_slots_free(void);
+/*=======================*/
+ /* out: TRUE if all free */
+#endif
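A minimal sketch (not part of the header above) of how the synchronous file primitives might be strung together; the file name, buffer contents and helper name are illustrative assumptions.

static void
os_file_usage_sketch(void)
{
	ibool		success;
	os_file_t	file;
	byte		buf[OS_FILE_SECTOR_SIZE];
	ulint		i;

	for (i = 0; i < OS_FILE_SECTOR_SIZE; i++) {
		buf[i] = 0;			/* one zero-filled sector */
	}

	file = os_file_create((char*)"ib_sketch_file", OS_FILE_CREATE,
				OS_FILE_NORMAL, &success);
	if (!success) {
		/* e.g. OS_FILE_ALREADY_EXISTS or OS_FILE_DISK_FULL */
		(void) os_file_get_last_error();

		return;
	}

	if (os_file_write((char*)"ib_sketch_file", file, buf, 0, 0,
				OS_FILE_SECTOR_SIZE)) {
		os_file_flush(file);	/* force the write to the disk */
	}

	os_file_close(file);
}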
diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h
new file mode 100644
index 00000000000..9da1f33e070
--- /dev/null
+++ b/innobase/include/os0proc.h
@@ -0,0 +1,71 @@
+/******************************************************
+The interface to the operating system
+process control primitives
+
+(c) 1995 Innobase Oy
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0proc_h
+#define os0proc_h
+
+#include "univ.i"
+
+typedef void* os_process_t;
+typedef unsigned long int os_process_id_t;
+
+/********************************************************************
+Allocates non-cacheable memory. */
+
+void*
+os_mem_alloc_nocache(
+/*=================*/
+ /* out: allocated memory */
+ ulint n); /* in: number of bytes */
+#ifdef notdefined
+/********************************************************************
+Creates a new process. */
+
+ibool
+os_process_create(
+/*==============*/
+ char* name, /* in: name of the executable to start
+ or its full path name */
+ char* cmd, /* in: command line for the starting
+ process, or NULL if no command line
+ specified */
+ os_process_t* proc, /* out: handle to the process */
+ os_process_id_t* id); /* out: process id */
+/**************************************************************************
+Exits a process. */
+
+void
+os_process_exit(
+/*============*/
+ ulint code); /* in: exit code */
+/**************************************************************************
+Gets process exit code. */
+
+ibool
+os_process_get_exit_code(
+/*=====================*/
+ /* out: TRUE if succeed, FALSE if fail */
+ os_process_t proc, /* in: handle to the process */
+ ulint* code); /* out: exit code */
+#endif
+/********************************************************************
+Sets the priority boost for threads released from waiting within the current
+process. */
+
+void
+os_process_set_priority_boost(
+/*==========================*/
+ ibool do_boost); /* in: TRUE if priority boost should be done,
+ FALSE if not */
+
+#ifndef UNIV_NONINL
+#include "os0proc.ic"
+#endif
+
+#endif
diff --git a/innobase/include/os0proc.ic b/innobase/include/os0proc.ic
new file mode 100644
index 00000000000..651ba1f17e3
--- /dev/null
+++ b/innobase/include/os0proc.ic
@@ -0,0 +1,10 @@
+/******************************************************
+The interface to the operating system
+process control primitives
+
+(c) 1995 Innobase Oy
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/innobase/include/os0shm.h b/innobase/include/os0shm.h
new file mode 100644
index 00000000000..250794a976f
--- /dev/null
+++ b/innobase/include/os0shm.h
@@ -0,0 +1,66 @@
+/******************************************************
+The interface to the operating system
+shared memory primitives
+
+(c) 1995 Innobase Oy
+
+Created 9/23/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0shm_h
+#define os0shm_h
+
+#include "univ.i"
+
+typedef void* os_shm_t;
+
+
+/********************************************************************
+Creates an area of shared memory. It can be named so that
+different processes may access it in the same computer.
+If an area with the same name already exists, returns
+a handle to that area (where the size of the area is
+not changed even if this call requests a different size).
+To use the area, it first has to be mapped to the process
+address space by os_shm_map. */
+
+os_shm_t
+os_shm_create(
+/*==========*/
+ /* out, own: handle to the shared
+ memory area, NULL if error */
+ ulint size, /* in: area size < 4 GB */
+ char* name); /* in: name of the area as a null-terminated
+ string */
+/***************************************************************************
+Frees a shared memory area. The area can be freed only after it
+has been unmapped in all the processes where it was mapped. */
+
+ibool
+os_shm_free(
+/*========*/
+ /* out: TRUE if success */
+ os_shm_t shm); /* in, own: handle to a shared memory area */
+/***************************************************************************
+Maps a shared memory area in the address space of a process. */
+
+void*
+os_shm_map(
+/*=======*/
+ /* out: address of the area, NULL if error */
+ os_shm_t shm); /* in: handle to a shared memory area */
+/***************************************************************************
+Unmaps a shared memory area from the address space of a process. */
+
+ibool
+os_shm_unmap(
+/*=========*/
+ /* out: TRUE if succeed */
+ void* addr); /* in: address of the area */
+
+
+#ifndef UNIV_NONINL
+#include "os0shm.ic"
+#endif
+
+#endif
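A minimal sketch (not part of the header above) of the intended create/map/unmap/free sequence for a shared memory area; the size, area name and helper name are illustrative assumptions.

static void
os_shm_usage_sketch(void)
{
	os_shm_t	shm;
	void*		addr;

	shm = os_shm_create(1024 * 1024, (char*)"ib_sketch_shm");

	if (shm == NULL) {

		return;
	}

	addr = os_shm_map(shm);		/* map into this process */

	if (addr != NULL) {
		/* ... read and write the shared area here ... */

		os_shm_unmap(addr);
	}

	/* freeing is allowed only after every process has unmapped the area */
	os_shm_free(shm);
}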
diff --git a/innobase/include/os0shm.ic b/innobase/include/os0shm.ic
new file mode 100644
index 00000000000..cc267544bc9
--- /dev/null
+++ b/innobase/include/os0shm.ic
@@ -0,0 +1,10 @@
+/******************************************************
+The interface to the operating system
+shared memory primitives
+
+(c) 1995 Innobase Oy
+
+Created 9/23/1995 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/innobase/include/os0sync.h b/innobase/include/os0sync.h
new file mode 100644
index 00000000000..dcf519fdb9d
--- /dev/null
+++ b/innobase/include/os0sync.h
@@ -0,0 +1,198 @@
+/******************************************************
+The interface to the operating system
+synchronization primitives.
+
+(c) 1995 Innobase Oy
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+#ifndef os0sync_h
+#define os0sync_h
+
+#include "univ.i"
+
+#ifdef __WIN__
+
+#include <windows.h>
+typedef CRITICAL_SECTION os_fast_mutex_t;
+typedef void* os_event_t;
+
+#else
+
+typedef pthread_mutex_t os_fast_mutex_t;
+struct os_event_struct {
+ os_fast_mutex_t os_mutex; /* this mutex protects the next
+ fields */
+ ibool is_set; /* this is TRUE if the next mutex is
+ not reserved */
+ os_fast_mutex_t wait_mutex; /* this mutex is used in waiting for
+ the event */
+};
+typedef struct os_event_struct os_event_struct_t;
+typedef os_event_struct_t* os_event_t;
+#endif
+
+typedef struct os_mutex_struct os_mutex_str_t;
+typedef os_mutex_str_t* os_mutex_t;
+
+#define OS_SYNC_INFINITE_TIME ((ulint)(-1))
+
+#define OS_SYNC_TIME_EXCEEDED 1
+
+/*************************************************************
+Creates an event semaphore, i.e., a semaphore which may
+have just two states: signaled and nonsignaled.
+The created event is manual reset: it must be reset
+explicitly by calling os_event_reset. */
+
+os_event_t
+os_event_create(
+/*============*/
+ /* out: the event handle */
+ char* name); /* in: the name of the event, if NULL
+ the event is created without a name */
+/*************************************************************
+Creates an auto-reset event semaphore, i.e., an event
+which is automatically reset when a single thread is
+released. */
+
+os_event_t
+os_event_create_auto(
+/*=================*/
+ /* out: the event handle */
+ char* name); /* in: the name of the event, if NULL
+ the event is created without a name */
+/**************************************************************
+Sets an event semaphore to the signaled state: lets waiting threads
+proceed. */
+
+void
+os_event_set(
+/*=========*/
+ os_event_t event); /* in: event to set */
+/**************************************************************
+Resets an event semaphore to the nonsignaled state. Threads that then wait
+for the event will block until it is set again. */
+
+void
+os_event_reset(
+/*===========*/
+ os_event_t event); /* in: event to reset */
+/**************************************************************
+Frees an event object. */
+
+void
+os_event_free(
+/*==========*/
+ os_event_t event); /* in: event to free */
+/**************************************************************
+Waits for an event object until it is in the signaled state. */
+
+void
+os_event_wait(
+/*==========*/
+ os_event_t event); /* in: event to wait */
+/**************************************************************
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded. */
+
+ulint
+os_event_wait_time(
+/*===============*/
+ /* out: 0 if success,
+ OS_SYNC_TIME_EXCEEDED if timeout
+ was exceeded */
+ os_event_t event, /* in: event to wait */
+ ulint time); /* in: timeout in microseconds, or
+ OS_SYNC_INFINITE_TIME */
+/**************************************************************
+Waits for any event in an event array. Returns as soon as at least one
+of the events is signaled or becomes signaled. */
+
+ulint
+os_event_wait_multiple(
+/*===================*/
+ /* out: index of the event
+ which was signaled */
+ ulint n, /* in: number of events in the
+ array */
+ os_event_t* event_array); /* in: pointer to an array of event
+ handles */
+/*************************************************************
+Creates an operating system mutex semaphore.
+Because these are slow, the mutex semaphore of the database
+itself (sync_mutex_t) should be used where possible. */
+
+os_mutex_t
+os_mutex_create(
+/*============*/
+ /* out: the mutex handle */
+ char* name); /* in: the name of the mutex, if NULL
+ the mutex is created without a name */
+/**************************************************************
+Acquires ownership of a mutex semaphore. */
+
+void
+os_mutex_enter(
+/*===========*/
+ os_mutex_t mutex); /* in: mutex to acquire */
+/**************************************************************
+Releases ownership of a mutex. */
+
+void
+os_mutex_exit(
+/*==========*/
+ os_mutex_t mutex); /* in: mutex to release */
+/**************************************************************
+Frees a mutex object. */
+
+void
+os_mutex_free(
+/*==========*/
+ os_mutex_t mutex); /* in: mutex to free */
+#ifndef _WIN32
+/**************************************************************
+Acquires ownership of a fast mutex. */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+ /* out: 0 if success, != 0 if
+ was reserved by another
+ thread */
+ os_fast_mutex_t* fast_mutex); /* in: mutex to acquire */
+/**************************************************************
+Releases ownership of a fast mutex. */
+UNIV_INLINE
+void
+os_fast_mutex_unlock(
+/*=================*/
+ os_fast_mutex_t* fast_mutex); /* in: mutex to release */
+/*************************************************************
+Initializes an operating system fast mutex semaphore. */
+
+void
+os_fast_mutex_init(
+/*===============*/
+ os_fast_mutex_t* fast_mutex); /* in: fast mutex */
+/**************************************************************
+Acquires ownership of a fast mutex. */
+
+void
+os_fast_mutex_lock(
+/*===============*/
+ os_fast_mutex_t* fast_mutex); /* in: mutex to acquire */
+/**************************************************************
+Frees a mutex object. */
+
+void
+os_fast_mutex_free(
+/*===============*/
+ os_fast_mutex_t* fast_mutex); /* in: mutex to free */
+#endif
+
+#ifndef UNIV_NONINL
+#include "os0sync.ic"
+#endif
+
+#endif
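A minimal sketch (not part of the header above) of the event calls; the timeout value and helper name are illustrative assumptions. In real use another thread would call os_event_set to release the waiter.

static void
os_event_usage_sketch(void)
{
	os_event_t	event;
	ulint		ret;

	event = os_event_create(NULL);	/* unnamed, manual-reset event */

	os_event_reset(event);		/* make sure it starts nonsignaled */

	/* wait with a timeout so that this sketch cannot block forever */
	ret = os_event_wait_time(event, 100000);

	if (ret == OS_SYNC_TIME_EXCEEDED) {
		/* nobody set the event within 100000 microseconds */
	}

	os_event_free(event);
}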
diff --git a/innobase/include/os0sync.ic b/innobase/include/os0sync.ic
new file mode 100644
index 00000000000..d82f38483e3
--- /dev/null
+++ b/innobase/include/os0sync.ic
@@ -0,0 +1,56 @@
+/******************************************************
+The interface to the operating system synchronization primitives.
+
+(c) 1995 Innobase Oy
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#ifdef __WIN__
+#include <winbase.h>
+#endif
+
+#ifndef _WIN32
+/**************************************************************
+Acquires ownership of a fast mutex. */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+ /* out: 0 if success, != 0 if
+ was reserved by another
+ thread */
+ os_fast_mutex_t* fast_mutex) /* in: mutex to acquire */
+{
+#ifdef __WIN__
+ int ret;
+
+ /* TryEnterCriticalSection is probably not found from
+ NT versions < 4! */
+ ret = TryEnterCriticalSection(fast_mutex);
+
+ if (ret) {
+ return(0);
+ }
+
+ return(1);
+#else
+ return((ulint) pthread_mutex_trylock(fast_mutex));
+#endif
+}
+
+/**************************************************************
+Releases ownership of a fast mutex. */
+UNIV_INLINE
+void
+os_fast_mutex_unlock(
+/*=================*/
+ os_fast_mutex_t* fast_mutex) /* in: mutex to release */
+{
+#ifdef __WIN__
+ LeaveCriticalSection(fast_mutex);
+#else
+ pthread_mutex_unlock(fast_mutex);
+#endif
+}
+#endif
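A minimal sketch (not part of the file above) of the fast mutex calls declared in os0sync.h; the helper name is illustrative. As the comment above notes, on Windows the trylock relies on TryEnterCriticalSection, which may be missing on NT versions before 4.

static void
os_fast_mutex_usage_sketch(void)
{
	os_fast_mutex_t	mutex;

	os_fast_mutex_init(&mutex);

	if (os_fast_mutex_trylock(&mutex) == 0) {
		/* acquired without blocking */
		os_fast_mutex_unlock(&mutex);
	} else {
		os_fast_mutex_lock(&mutex);	/* block until it is free */
		os_fast_mutex_unlock(&mutex);
	}

	os_fast_mutex_free(&mutex);
}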
diff --git a/innobase/include/os0thread.h b/innobase/include/os0thread.h
new file mode 100644
index 00000000000..2b2d9fb4bd6
--- /dev/null
+++ b/innobase/include/os0thread.h
@@ -0,0 +1,121 @@
+/******************************************************
+The interface to the operating system
+process and thread control primitives
+
+(c) 1995 Innobase Oy
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0thread_h
+#define os0thread_h
+
+#include "univ.i"
+
+/* Maximum number of threads which can be created in the program */
+#define OS_THREAD_MAX_N 1000
+
+/* Possible fixed priorities for threads */
+#define OS_THREAD_PRIORITY_NONE 100
+#define OS_THREAD_PRIORITY_BACKGROUND 1
+#define OS_THREAD_PRIORITY_NORMAL 2
+#define OS_THREAD_PRIORITY_ABOVE_NORMAL 3
+
+#ifdef __WIN__
+typedef void* os_thread_t;
+#else
+typedef pthread_t os_thread_t;
+#endif
+typedef unsigned long int os_thread_id_t;
+
+/********************************************************************
+Creates a new thread of execution. The execution starts from
+the function given. The start function takes a void* parameter
+and returns a ulint. */
+
+os_thread_t
+os_thread_create(
+/*=============*/
+ /* out: handle to the thread */
+ ulint (*start_f)(void*), /* in: pointer to function
+ from which to start */
+ void* arg, /* in: argument to start
+ function */
+ os_thread_id_t* thread_id); /* out: id of created
+ thread */
+/*********************************************************************
+A thread calling this function ends its execution. */
+
+void
+os_thread_exit(
+/*===========*/
+ ulint code); /* in: exit code */
+/*********************************************************************
+Returns the thread identifier of the current thread. */
+
+os_thread_id_t
+os_thread_get_curr_id(void);
+/*========================*/
+/*********************************************************************
+Returns handle to the current thread. */
+
+os_thread_t
+os_thread_get_curr(void);
+/*====================*/
+/*********************************************************************
+Converts a thread id to a ulint. */
+
+ulint
+os_thread_conv_id_to_ulint(
+/*=======================*/
+ /* out: converted to ulint */
+ os_thread_id_t id); /* in: thread id */
+/*********************************************************************
+Waits for a thread to terminate. */
+
+void
+os_thread_wait(
+/*===========*/
+ os_thread_t thread); /* in: thread to wait */
+/*********************************************************************
+Advises the OS to give up the remainder of the thread's time slice. */
+
+void
+os_thread_yield(void);
+/*=================*/
+/*********************************************************************
+The thread sleeps for at least the time given in microseconds. */
+
+void
+os_thread_sleep(
+/*============*/
+ ulint tm); /* in: time in microseconds */
+/**********************************************************************
+Gets a thread priority. */
+
+ulint
+os_thread_get_priority(
+/*===================*/
+ /* out: priority */
+ os_thread_t handle);/* in: OS handle to the thread */
+/**********************************************************************
+Sets a thread priority. */
+
+void
+os_thread_set_priority(
+/*===================*/
+ os_thread_t handle, /* in: OS handle to the thread */
+	ulint		pri);	/* in: priority: one of OS_THREAD_PRIORITY_... */
+/**********************************************************************
+Gets the last operating system error code for the calling thread. */
+
+ulint
+os_thread_get_last_error(void);
+/*==========================*/
+
+
+#ifndef UNIV_NONINL
+#include "os0thread.ic"
+#endif
+
+#endif
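A minimal sketch (not part of the header above) of creating a thread and waiting for it to terminate; the start function and helper names are illustrative assumptions.

static ulint
sketch_thread_main(void* arg)
{
	(void) arg;			/* unused in this sketch */

	os_thread_sleep(1000);		/* sleep at least 1000 microseconds */

	return(0);
}

static void
os_thread_usage_sketch(void)
{
	os_thread_t	handle;
	os_thread_id_t	id;

	handle = os_thread_create(sketch_thread_main, NULL, &id);

	os_thread_wait(handle);		/* wait until the thread terminates */
}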
diff --git a/innobase/include/os0thread.ic b/innobase/include/os0thread.ic
new file mode 100644
index 00000000000..a75aa3abb34
--- /dev/null
+++ b/innobase/include/os0thread.ic
@@ -0,0 +1,8 @@
+/******************************************************
+The interface to the operating system
+process and thread control primitives
+
+(c) 1995 Innobase Oy
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/page0cur.h b/innobase/include/page0cur.h
new file mode 100644
index 00000000000..144e0e02b21
--- /dev/null
+++ b/innobase/include/page0cur.h
@@ -0,0 +1,263 @@
+/************************************************************************
+The page cursor
+
+(c) 1994-1996 Innobase Oy
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef page0cur_h
+#define page0cur_h
+
+#include "univ.i"
+
+#include "page0types.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+
+
+#define PAGE_CUR_ADAPT
+
+/* Page cursor search modes; the values must be in this order! */
+
+#define PAGE_CUR_G 1
+#define PAGE_CUR_GE 2
+#define PAGE_CUR_L 3
+#define PAGE_CUR_LE 4
+#define PAGE_CUR_DBG 5
+
+extern ulint page_cur_short_succ;
+
+/*************************************************************
+Gets pointer to the page frame where the cursor is positioned. */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ /* out: page */
+ page_cur_t* cur); /* in: page cursor */
+/*************************************************************
+Gets the record where the cursor is positioned. */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+ /* out: record */
+ page_cur_t* cur); /* in: page cursor */
+/*************************************************************
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ page_t* page, /* in: index page */
+ page_cur_t* cur); /* in: cursor */
+/*************************************************************
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ page_t* page, /* in: index page */
+ page_cur_t* cur); /* in: cursor */
+/*************************************************************
+Returns TRUE if the cursor is before first user record on page. */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ /* out: TRUE if at start */
+ page_cur_t* cur); /* in: cursor */
+/*************************************************************
+Returns TRUE if the cursor is after last user record. */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ /* out: TRUE if at end */
+ page_cur_t* cur); /* in: cursor */
+/**************************************************************
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ rec_t* rec, /* in: record on a page */
+ page_cur_t* cur); /* in: page cursor */
+/**************************************************************
+Invalidates a page cursor by setting the record pointer NULL. */
+UNIV_INLINE
+void
+page_cur_invalidate(
+/*================*/
+ page_cur_t* cur); /* in: page cursor */
+/**************************************************************
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur); /* in: cursor; must not be after last */
+/**************************************************************
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+	page_cur_t*	cur);	/* in: cursor; must not be before first */
+/***************************************************************
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same position. */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ /* out: pointer to record if succeed, NULL
+ otherwise */
+ page_cur_t* cursor, /* in: a page cursor */
+ dtuple_t* tuple, /* in: pointer to a data tuple */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/***************************************************************
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same position. */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+ /* out: pointer to record if succeed, NULL
+ otherwise */
+ page_cur_t* cursor, /* in: a page cursor */
+ rec_t* rec, /* in: record to insert */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/***************************************************************
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The record to be
+inserted can be given either as a data tuple or as a physical record; the
+other parameter must then be NULL. The cursor stays at the same position. */
+
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+ /* out: pointer to record if succeed, NULL
+ otherwise */
+ page_cur_t* cursor, /* in: a page cursor */
+ dtuple_t* tuple, /* in: pointer to a data tuple or NULL */
+ ulint data_size,/* in: data size of tuple */
+ rec_t* rec, /* in: pointer to a physical record or NULL */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/*****************************************************************
+Copies records from page to a newly created page, from a given record onward,
+including that record. Infimum and supremum records are not copied. */
+
+void
+page_copy_rec_list_end_to_created_page(
+/*===================================*/
+ page_t* new_page, /* in: index page to copy to */
+ page_t* page, /* in: index page */
+ rec_t* rec, /* in: first record to copy */
+ mtr_t* mtr); /* in: mtr */
+/***************************************************************
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+
+void
+page_cur_delete_rec(
+/*================*/
+ page_cur_t* cursor, /* in: a page cursor */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/********************************************************************
+Searches the right position for a page cursor. */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+ /* out: number of matched fields on the left */
+ page_t* page, /* in: index page */
+ dtuple_t* tuple, /* in: data tuple */
+ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G,
+ or PAGE_CUR_GE */
+ page_cur_t* cursor);/* out: page cursor */
+/********************************************************************
+Searches the right position for a page cursor. */
+
+void
+page_cur_search_with_match(
+/*=======================*/
+ page_t* page, /* in: index page */
+ dtuple_t* tuple, /* in: data tuple */
+ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G,
+ or PAGE_CUR_GE */
+ ulint* iup_matched_fields,
+ /* in/out: already matched fields in upper
+ limit record */
+ ulint* iup_matched_bytes,
+ /* in/out: already matched bytes in a field
+ not yet completely matched */
+ ulint* ilow_matched_fields,
+ /* in/out: already matched fields in lower
+ limit record */
+ ulint* ilow_matched_bytes,
+ /* in/out: already matched bytes in a field
+ not yet completely matched */
+ page_cur_t* cursor); /* out: page cursor */
+/***************************************************************
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+ page_t* page, /* in: page */
+ page_cur_t* cursor);/* in/out: page cursor */
+/***************************************************************
+Parses a log record of a record insert on a page. */
+
+byte*
+page_cur_parse_insert_rec(
+/*======================*/
+ /* out: end of log record or NULL */
+ ibool is_short,/* in: TRUE if short inserts */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+/**************************************************************
+Parses a log record of copying a record list end to a newly created page. */
+
+byte*
+page_parse_copy_rec_list_to_created_page(
+/*=====================================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+/***************************************************************
+Parses log record of a record delete on a page. */
+
+byte*
+page_cur_parse_delete_rec(
+/*======================*/
+ /* out: pointer to record end or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+
+/* Index page cursor */
+
+struct page_cur_struct{
+ byte* rec; /* pointer to a record on page */
+};
+
+#ifndef UNIV_NONINL
+#include "page0cur.ic"
+#endif
+
+#endif
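A minimal sketch (not part of the header above) of iterating over the user records on a page with a page cursor; the helper name is illustrative.

static ulint
page_cur_count_user_recs_sketch(page_t* page)
{
	page_cur_t	cur;
	ulint		n = 0;

	page_cur_set_before_first(page, &cur);	/* position on the infimum */

	page_cur_move_to_next(&cur);		/* first user record, or the
						supremum if the page is empty */

	while (!page_cur_is_after_last(&cur)) {
		n++;

		page_cur_move_to_next(&cur);
	}

	return(n);
}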
diff --git a/innobase/include/page0cur.ic b/innobase/include/page0cur.ic
new file mode 100644
index 00000000000..4313036adaf
--- /dev/null
+++ b/innobase/include/page0cur.ic
@@ -0,0 +1,221 @@
+/************************************************************************
+The page cursor
+
+(c) 1994-1996 Innobase Oy
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "page0page.h"
+
+
+/*************************************************************
+Gets pointer to the page frame where the cursor is positioned. */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ /* out: page */
+ page_cur_t* cur) /* in: page cursor */
+{
+ ut_ad(cur);
+
+ return(buf_frame_align(cur->rec));
+}
+
+/*************************************************************
+Gets the record where the cursor is positioned. */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+ /* out: record */
+ page_cur_t* cur) /* in: page cursor */
+{
+ ut_ad(cur);
+
+ return(cur->rec);
+}
+
+/*************************************************************
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ page_t* page, /* in: index page */
+ page_cur_t* cur) /* in: cursor */
+{
+ cur->rec = page_get_infimum_rec(page);
+}
+
+/*************************************************************
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ page_t* page, /* in: index page */
+ page_cur_t* cur) /* in: cursor */
+{
+ cur->rec = page_get_supremum_rec(page);
+}
+
+/*************************************************************
+Returns TRUE if the cursor is before first user record on page. */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ /* out: TRUE if at start */
+ page_cur_t* cur) /* in: cursor */
+{
+ if (page_get_infimum_rec(page_cur_get_page(cur)) == cur->rec) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************
+Returns TRUE if the cursor is after last user record. */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ /* out: TRUE if at end */
+ page_cur_t* cur) /* in: cursor */
+{
+ if (page_get_supremum_rec(page_cur_get_page(cur)) == cur->rec) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ rec_t* rec, /* in: record on a page */
+ page_cur_t* cur) /* in: page cursor */
+{
+ ut_ad(rec && cur);
+
+ cur->rec = rec;
+}
+
+/**************************************************************
+Invalidates a page cursor by setting the record pointer NULL. */
+UNIV_INLINE
+void
+page_cur_invalidate(
+/*================*/
+ page_cur_t* cur) /* in: page cursor */
+{
+ ut_ad(cur);
+
+ cur->rec = NULL;
+}
+
+/**************************************************************
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur) /* in: cursor; must not be after last */
+{
+ ut_ad(!page_cur_is_after_last(cur));
+
+ cur->rec = page_rec_get_next(cur->rec);
+}
+
+/**************************************************************
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+	page_cur_t*	cur)	/* in: cursor; must not be before first */
+{
+ ut_ad(!page_cur_is_before_first(cur));
+
+ cur->rec = page_rec_get_prev(cur->rec);
+}
+
+/********************************************************************
+Searches the right position for a page cursor. */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+ /* out: number of matched fields on the left */
+ page_t* page, /* in: index page */
+ dtuple_t* tuple, /* in: data tuple */
+ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G,
+ or PAGE_CUR_GE */
+ page_cur_t* cursor) /* out: page cursor */
+{
+ ulint low_matched_fields = 0;
+ ulint low_matched_bytes = 0;
+ ulint up_matched_fields = 0;
+ ulint up_matched_bytes = 0;
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ page_cur_search_with_match(page, tuple, mode,
+ &low_matched_fields,
+ &low_matched_bytes,
+ &up_matched_fields,
+ &up_matched_bytes,
+ cursor);
+ return(low_matched_fields);
+}
+
+/***************************************************************
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same position. */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ /* out: pointer to record if succeed, NULL
+ otherwise */
+ page_cur_t* cursor, /* in: a page cursor */
+ dtuple_t* tuple, /* in: pointer to a data tuple */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ ulint data_size;
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ data_size = dtuple_get_data_size(tuple);
+
+ return(page_cur_insert_rec_low(cursor, tuple, data_size, NULL, mtr));
+}
+
+/***************************************************************
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same position. */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+ /* out: pointer to record if succeed, NULL
+ otherwise */
+ page_cur_t* cursor, /* in: a page cursor */
+ rec_t* rec, /* in: record to insert */
+ mtr_t* mtr) /* in: mini-transaction handle */
+{
+ return(page_cur_insert_rec_low(cursor, NULL, 0, rec, mtr));
+}
+
diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h
new file mode 100644
index 00000000000..8e68381b868
--- /dev/null
+++ b/innobase/include/page0page.h
@@ -0,0 +1,697 @@
+/******************************************************
+Index page routines
+
+(c) 1994-1996 Innobase Oy
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0page_h
+#define page0page_h
+
+#include "univ.i"
+
+#include "page0types.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "data0data.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "fsp0fsp.h"
+#include "mtr0mtr.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/* PAGE HEADER
+ ===========
+
+Index page header starts at the first offset left free by the FIL-module */
+
+typedef byte page_header_t;
+
+#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this
+ offset */
+/*-----------------------------*/
+#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */
+#define PAGE_HEAP_TOP 2 /* pointer to record heap top */
+#define PAGE_N_HEAP 4 /* number of records in the heap */
+#define PAGE_FREE 6 /* pointer to start of page free record list */
+#define PAGE_GARBAGE 8 /* number of bytes in deleted records */
+#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or
+ NULL if this info has been reset by a delete,
+ for example */
+#define PAGE_DIRECTION 12 /* last insert direction: PAGE_LEFT, ... */
+#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same
+ direction */
+#define PAGE_N_RECS 16 /* number of user records on the page */
+#define PAGE_MAX_TRX_ID 18 /* highest id of a trx which may have modified
+ a record on the page; a dulint; defined only
+ in secondary indexes; specifically, not in an
+ ibuf tree; NOTE: this may be modified only
+ when the thread has an x-latch to the page,
+ and ALSO an x-latch to btr_search_latch
+ if there is a hash index to the page! */
+#define PAGE_HEADER_PRIV_END 26 /* end of private data structure of the page
+ header which are set in a page create */
+/*----*/
+#define PAGE_LEVEL 26 /* level of the node in an index tree; the
+ leaf level is the level 0 */
+#define PAGE_INDEX_ID 28 /* index id where the page belongs */
+#define PAGE_BTR_SEG_LEAF 36 /* file segment header for the leaf pages in
+ a B-tree: defined only on the root page of a
+ B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+ /* in the place of PAGE_BTR_SEG_LEAF and _TOP
+ there is a free list base node if the page is
+ the root page of an ibuf tree, and at the same
+ place is the free list node if the page is in
+ a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+ /* file segment header for the non-leaf pages
+ in a B-tree: defined only on the root page of
+ a B-tree, but not in the root of an ibuf
+ tree */
+/*----*/
+#define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+ /* start of data on the page */
+
+#define PAGE_INFIMUM (PAGE_DATA + 1 + REC_N_EXTRA_BYTES)
+ /* offset of the page infimum record on the
+ page */
+#define PAGE_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on the
+ page */
+#define PAGE_SUPREMUM_END (PAGE_SUPREMUM + 9)
+ /* offset of the page supremum record end on
+ the page */
+/*-----------------------------*/
+
+/* Directions of cursor movement */
+#define PAGE_LEFT 1
+#define PAGE_RIGHT 2
+#define PAGE_SAME_REC 3
+#define PAGE_SAME_PAGE 4
+#define PAGE_NO_DIRECTION 5
+
+/* PAGE DIRECTORY
+ ==============
+*/
+
+typedef byte page_dir_slot_t;
+typedef page_dir_slot_t page_dir_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address directory start, as it points to
+the first record in the list of records. */
+#define PAGE_DIR FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+#define PAGE_DIR_SLOT_SIZE 2
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START (PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot. The
+number may drop below the minimum in the first and the last slot in the
+directory. */
+#define PAGE_DIR_SLOT_MAX_N_OWNED 8
+#define PAGE_DIR_SLOT_MIN_N_OWNED 4
+
+/*****************************************************************
+Returns the max trx id field value. */
+UNIV_INLINE
+dulint
+page_get_max_trx_id(
+/*================*/
+ page_t* page); /* in: page */
+/*****************************************************************
+Sets the max trx id field value. */
+
+void
+page_set_max_trx_id(
+/*================*/
+ page_t* page, /* in: page */
+ dulint trx_id);/* in: transaction id */
+/*****************************************************************
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ page_t* page, /* in: page */
+ dulint trx_id); /* in: transaction id */
+/*****************************************************************
+Reads the given header field. */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+ page_t* page, /* in: page */
+ ulint field); /* in: PAGE_N_DIR_SLOTS, ... */
+/*****************************************************************
+Sets the given header field. */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+ page_t* page, /* in: page */
+ ulint field, /* in: PAGE_N_DIR_SLOTS, ... */
+ ulint val); /* in: value */
+/*****************************************************************
+Returns the pointer stored in the given header field. */
+UNIV_INLINE
+byte*
+page_header_get_ptr(
+/*================*/
+ /* out: pointer or NULL */
+ page_t* page, /* in: page */
+ ulint field); /* in: PAGE_FREE, ... */
+/*****************************************************************
+Sets the pointer stored in the given header field. */
+UNIV_INLINE
+void
+page_header_set_ptr(
+/*================*/
+ page_t* page, /* in: page */
+ ulint field, /* in: PAGE_FREE, ... */
+ byte* ptr); /* in: pointer or NULL*/
+/*****************************************************************
+Resets the last insert info field in the page header. Writes to mlog
+about this operation. */
+UNIV_INLINE
+void
+page_header_reset_last_insert(
+/*==========================*/
+ page_t* page, /* in: page */
+ mtr_t* mtr); /* in: mtr */
+/****************************************************************
+Gets the first record on the page. */
+UNIV_INLINE
+rec_t*
+page_get_infimum_rec(
+/*=================*/
+ /* out: the first record in record list */
+ page_t* page); /* in: page which must have record(s) */
+/****************************************************************
+Gets the last record on the page. */
+UNIV_INLINE
+rec_t*
+page_get_supremum_rec(
+/*==================*/
+ /* out: the last record in record list */
+ page_t* page); /* in: page which must have record(s) */
+/****************************************************************
+Returns the middle record of the record list. If there is an even number
+of records in the list, returns the first record of the upper half-list. */
+
+rec_t*
+page_get_middle_rec(
+/*================*/
+ /* out: middle record */
+ page_t* page); /* in: page */
+/*****************************************************************
+Compares a data tuple to a physical record. Differs from the function
+cmp_dtuple_rec_with_match in the way that the record must reside on an
+index page, and also page infimum and supremum records can be given in
+the parameter rec. These are considered as the negative infinity and
+the positive infinity in the alphabetical order. */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+ /* out: 1, 0, -1, if dtuple is greater, equal,
+ less than rec, respectively, when only the
+ common first fields are compared */
+ dtuple_t* dtuple, /* in: data tuple */
+ rec_t* rec, /* in: physical record on a page; may also
+ be page infimum or supremum, in which case
+ matched-parameter values below are not
+ affected */
+ ulint* matched_fields, /* in/out: number of already completely
+ matched fields; when function returns
+ contains the value for current comparison */
+ ulint* matched_bytes); /* in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns contains the
+ value for current comparison */
+/*****************************************************************
+Gets the number of user records on page (the infimum and supremum records
+are not user records). */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+ /* out: number of user records */
+ page_t* page); /* in: index page */
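A minimal sketch (not part of this header) showing that the generic header accessor and the specialized getter above read the same field; the helper name is illustrative, and the assertion assumes the ut_ad debug macro available via univ.i.

static ulint
page_n_recs_sketch(page_t* page)
{
	ulint	n;

	n = page_header_get_field(page, PAGE_N_RECS);

	ut_ad(n == page_get_n_recs(page));	/* specialized getter, same value */

	return(n);
}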
+/*******************************************************************
+Returns the number of records before the given record in chain.
+The number includes infimum and supremum records. */
+
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+ /* out: number of records */
+ rec_t* rec); /* in: the physical record */
+/*****************************************************************
+Gets the number of dir slots in directory. */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+ /* out: number of slots */
+ page_t* page); /* in: index page */
+/*****************************************************************
+Gets pointer to nth directory slot. */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+ /* out: pointer to dir slot */
+ page_t* page, /* in: index page */
+ ulint n); /* in: position */
+/******************************************************************
+Used to check the consistency of a record on a page. */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ /* out: TRUE if succeed */
+ rec_t* rec); /* in: record */
+/*******************************************************************
+Gets the record pointed to by a directory slot. */
+UNIV_INLINE
+rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+ /* out: pointer to record */
+ page_dir_slot_t* slot); /* in: directory slot */
+/*******************************************************************
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+ page_dir_slot_t* slot, /* in: directory slot */
+ rec_t* rec); /* in: record on the page */
+/*******************************************************************
+Gets the number of records owned by a directory slot. */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ /* out: number of records */
+ page_dir_slot_t* slot); /* in: page directory slot */
+/*******************************************************************
+This is used to set the owned records field of a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+ page_dir_slot_t* slot, /* in: directory slot */
+ ulint n); /* in: number of records owned
+ by the slot */
+/****************************************************************
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is a fraction number
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is
+rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs); /* in: number of records */
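A worked instance of the formula above, using the defines earlier in this header: with PAGE_DIR_SLOT_SIZE = 2 and PAGE_DIR_SLOT_MIN_N_OWNED = 4, a page with 101 records reserves ceil(101 * 2 / 4) = 51 bytes of directory space.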
+/*******************************************************************
+Looks for the directory slot which owns the given record. */
+UNIV_INLINE
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ /* out: the directory slot number */
+ rec_t* rec); /* in: the physical record */
+/****************************************************************
+Gets the pointer to the next record on the page. */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ /* out: pointer to next record */
+ rec_t* rec); /* in: pointer to record, must not be page
+ supremum */
+/****************************************************************
+Sets the pointer to the next record on the page. */
+UNIV_INLINE
+void
+page_rec_set_next(
+/*==============*/
+ rec_t* rec, /* in: pointer to record, must not be
+ page supremum */
+ rec_t* next); /* in: pointer to next record, must not
+ be page infimum */
+/****************************************************************
+Gets the pointer to the previous record. */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+ /* out: pointer to previous record */
+ rec_t* rec); /* in: pointer to record, must not be page
+ infimum */
+/****************************************************************
+TRUE if the record is a user record on the page. */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec(
+/*=================*/
+ /* out: TRUE if a user record */
+ rec_t* rec); /* in: record */
+/****************************************************************
+TRUE if the record is the supremum record on a page. */
+UNIV_INLINE
+ibool
+page_rec_is_supremum(
+/*=================*/
+ /* out: TRUE if the supremum record */
+ rec_t* rec); /* in: record */
+/****************************************************************
+TRUE if the record is the infimum record on a page. */
+UNIV_INLINE
+ibool
+page_rec_is_infimum(
+/*================*/
+ /* out: TRUE if the infimum record */
+ rec_t* rec); /* in: record */
+/****************************************************************
+TRUE if the record is the first user record on the page. */
+UNIV_INLINE
+ibool
+page_rec_is_first_user_rec(
+/*=======================*/
+ /* out: TRUE if first user record */
+ rec_t* rec); /* in: record */
+/****************************************************************
+TRUE if the record is the last user record on the page. */
+UNIV_INLINE
+ibool
+page_rec_is_last_user_rec(
+/*======================*/
+ /* out: TRUE if last user record */
+ rec_t* rec); /* in: record */
+/*******************************************************************
+Looks for the record which owns the given record. */
+UNIV_INLINE
+rec_t*
+page_rec_find_owner_rec(
+/*====================*/
+ /* out: the owner record */
+ rec_t* rec); /* in: the physical record */
+/***************************************************************************
+This is a low-level operation which is used during database index creation
+to update the page number of a created B-tree in a data dictionary
+record. */
+
+void
+page_rec_write_index_page_no(
+/*=========================*/
+ rec_t* rec, /* in: record to update */
+ ulint i, /* in: index of the field to update */
+ ulint page_no,/* in: value to write */
+ mtr_t* mtr); /* in: mtr */
+/****************************************************************
+Returns the maximum combined size of records which can be inserted on top
+of record heap. */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ /* out: maximum combined size for inserted records */
+ page_t* page, /* in: index page */
+ ulint n_recs); /* in: number of records */
+/****************************************************************
+Returns the maximum combined size of records which can be inserted on top
+of record heap if page is first reorganized. */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ /* out: maximum combined size for inserted records */
+ page_t* page, /* in: index page */
+ ulint n_recs);/* in: number of records */
+/*****************************************************************
+Calculates free space if a page is emptied. */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(void);
+/*==============================*/
+ /* out: free space */
+/****************************************************************
+Returns the sum of the sizes of the records in the record list
+excluding the infimum and supremum records. */
+UNIV_INLINE
+ulint
+page_get_data_size(
+/*===============*/
+ /* out: data in bytes */
+ page_t* page); /* in: index page */
+/****************************************************************
+Allocates a block of memory from an index page. */
+
+byte*
+page_mem_alloc(
+/*===========*/
+ /* out: pointer to start of allocated
+ buffer, or NULL if allocation fails */
+ page_t* page, /* in: index page */
+ ulint need, /* in: number of bytes needed */
+ ulint* heap_no);/* out: this contains the heap number
+ of the allocated record if allocation succeeds */
+/****************************************************************
+Puts a record to the free list. */
+UNIV_INLINE
+void
+page_mem_free(
+/*==========*/
+ page_t* page, /* in: index page */
+ rec_t* rec); /* in: pointer to the (origin of) record */
+/**************************************************************
+The index page creation function. */
+
+page_t*
+page_create(
+/*========*/
+ /* out: pointer to the page */
+ buf_frame_t* frame, /* in: a buffer frame where the page is
+ created */
+ mtr_t* mtr); /* in: mini-transaction handle */
+/*****************************************************************
+Differs from page_copy_rec_list_end because this function does not
+touch the lock table or the max trx id on the page. */
+
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+ page_t* new_page, /* in: index page to copy to */
+ page_t* page, /* in: index page */
+ rec_t* rec, /* in: record on page */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Copies records from page to new_page, from the given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page. */
+
+void
+page_copy_rec_list_end(
+/*===================*/
+ page_t* new_page, /* in: index page to copy to */
+ page_t* page, /* in: index page */
+ rec_t* rec, /* in: record on page */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Copies records from page to new_page, up to the given record, NOT
+including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page. */
+
+void
+page_copy_rec_list_start(
+/*=====================*/
+ page_t* new_page, /* in: index page to copy to */
+ page_t* page, /* in: index page */
+ rec_t* rec, /* in: record on page */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+
+void
+page_delete_rec_list_end(
+/*=====================*/
+ page_t* page, /* in: index page */
+ rec_t* rec, /* in: record on page */
+ ulint n_recs, /* in: number of records to delete, or ULINT_UNDEFINED
+ if not known */
+ ulint size, /* in: the sum of the sizes of the records in the end
+ of the chain to delete, or ULINT_UNDEFINED if not
+ known */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+
+void
+page_delete_rec_list_start(
+/*=======================*/
+ page_t* page, /* in: index page */
+ rec_t* rec, /* in: record on page */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Moves record list end to another page. Moved records include
+split_rec. */
+
+void
+page_move_rec_list_end(
+/*===================*/
+ page_t* new_page, /* in: index page where to move */
+ page_t* page, /* in: index page */
+ rec_t* split_rec, /* in: first record to move */
+ mtr_t* mtr); /* in: mtr */
+/*****************************************************************
+Moves record list start to another page. Moved records do not include
+split_rec. */
+
+void
+page_move_rec_list_start(
+/*=====================*/
+ page_t* new_page, /* in: index page where to move */
+ page_t* page, /* in: index page */
+ rec_t* split_rec, /* in: first record not to move */
+ mtr_t* mtr); /* in: mtr */
+/********************************************************************
+Splits a directory slot which owns too many records. */
+
+void
+page_dir_split_slot(
+/*================*/
+ page_t* page, /* in: the index page in question */
+ ulint slot_no); /* in: the directory slot */
+/*****************************************************************
+Tries to balance the given directory slot, which has too few records,
+with the upper neighbor, so that there are at least the minimum number
+of records owned by the slot; this may result in the merging of
+two slots. */
+
+void
+page_dir_balance_slot(
+/*==================*/
+ page_t* page, /* in: index page */
+ ulint slot_no); /* in: the directory slot */
+/**************************************************************
+Parses a log record of a record list end or start deletion. */
+
+byte*
+page_parse_delete_rec_list(
+/*=======================*/
+ /* out: end of log record or NULL */
+ byte type, /* in: MLOG_LIST_END_DELETE or MLOG_LIST_START_DELETE */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+/***************************************************************
+Parses a redo log record of creating a page. */
+
+byte*
+page_parse_create(
+/*==============*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+/****************************************************************
+Prints record contents including the data relevant only in
+the index page context. */
+
+void
+page_rec_print(
+/*===========*/
+ rec_t* rec);
+/*******************************************************************
+This is used to print the contents of the directory for
+debugging purposes. */
+
+void
+page_dir_print(
+/*===========*/
+ page_t* page, /* in: index page */
+ ulint pr_n); /* in: print n first and n last entries */
+/*******************************************************************
+This is used to print the contents of the page record list for
+debugging purposes. */
+
+void
+page_print_list(
+/*============*/
+ page_t* page, /* in: index page */
+ ulint pr_n); /* in: print n first and n last entries */
+/*******************************************************************
+Prints the info in a page header. */
+
+void
+page_header_print(
+/*==============*/
+ page_t* page);
+/*******************************************************************
+This is used to print the contents of the page for
+debugging purposes. */
+
+void
+page_print(
+/*======*/
+ page_t* page, /* in: index page */
+ ulint dn, /* in: print dn first and last entries in directory */
+ ulint rn); /* in: print rn first and last records on page */
+/*******************************************************************
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field. */
+
+ibool
+page_rec_validate(
+/*==============*/
+ /* out: TRUE if ok */
+ rec_t* rec); /* in: record on the page */
+/*******************************************************************
+This function checks the consistency of an index page. */
+
+ibool
+page_validate(
+/*==========*/
+ /* out: TRUE if ok */
+ page_t* page, /* in: index page */
+ dict_index_t* index); /* in: data dictionary index containing
+ the page record type definition */
+/*******************************************************************
+Looks in the page record list for a record with the given heap number. */
+
+rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+ /* out: record, NULL if not found */
+ page_t* page, /* in: index page */
+ ulint heap_no);/* in: heap number */
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
+
+#ifndef UNIV_NONINL
+#include "page0page.ic"
+#endif
+
+#endif
diff --git a/innobase/include/page0page.ic b/innobase/include/page0page.ic
new file mode 100644
index 00000000000..a029604c2bc
--- /dev/null
+++ b/innobase/include/page0page.ic
@@ -0,0 +1,772 @@
+/******************************************************
+Index page routines
+
+(c) 1994-1996 Innobase Oy
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "rem0cmp.h"
+#include "mtr0log.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/*****************************************************************
+Returns the max trx id field value. */
+UNIV_INLINE
+dulint
+page_get_max_trx_id(
+/*================*/
+ page_t* page) /* in: page */
+{
+ ut_ad(page);
+
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_MAX_TRX_ID));
+}
+
+/*****************************************************************
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ page_t* page, /* in: page */
+ dulint trx_id) /* in: transaction id */
+{
+ ut_ad(page);
+
+ if (ut_dulint_cmp(page_get_max_trx_id(page), trx_id) < 0) {
+
+ page_set_max_trx_id(page, trx_id);
+ }
+}
+
+/*****************************************************************
+Reads the given header field. */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+ page_t* page, /* in: page */
+ ulint field) /* in: PAGE_LEVEL, ... */
+{
+ ut_ad(page);
+ ut_ad(field <= PAGE_INDEX_ID);
+
+ return(mach_read_from_2(page + PAGE_HEADER + field));
+}
+
+/*****************************************************************
+Sets the given header field. */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+ page_t* page, /* in: page */
+ ulint field, /* in: PAGE_LEVEL, ... */
+ ulint val) /* in: value */
+{
+ ut_ad(page);
+ ut_ad(field <= PAGE_N_RECS);
+ ut_ad(val < UNIV_PAGE_SIZE);
+
+ mach_write_to_2(page + PAGE_HEADER + field, val);
+}
+
+/*****************************************************************
+Returns the pointer stored in the given header field. */
+UNIV_INLINE
+byte*
+page_header_get_ptr(
+/*================*/
+ /* out: pointer or NULL */
+ page_t* page, /* in: page */
+ ulint field) /* in: PAGE_FREE, ... */
+{
+ ulint offs;
+
+ ut_ad(page);
+ ut_ad((field == PAGE_FREE)
+ || (field == PAGE_LAST_INSERT)
+ || (field == PAGE_HEAP_TOP));
+
+ offs = page_header_get_field(page, field);
+
+ ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+ if (offs == 0) {
+
+ return(NULL);
+ }
+
+ return(page + offs);
+}
+
+/*****************************************************************
+Sets the pointer stored in the given header field. */
+UNIV_INLINE
+void
+page_header_set_ptr(
+/*================*/
+ page_t* page, /* in: page */
+ ulint field, /* in: PAGE_FREE, ... */
+ byte* ptr) /* in: pointer or NULL*/
+{
+ ulint offs;
+
+ ut_ad(page);
+ ut_ad((field == PAGE_FREE)
+ || (field == PAGE_LAST_INSERT)
+ || (field == PAGE_HEAP_TOP));
+
+ if (ptr == NULL) {
+ offs = 0;
+ } else {
+ offs = ptr - page;
+ }
+
+ ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+ page_header_set_field(page, field, offs);
+}
+
+/*****************************************************************
+Resets the last insert info field in the page header. Writes to mlog
+about this operation. */
+UNIV_INLINE
+void
+page_header_reset_last_insert(
+/*==========================*/
+ page_t* page, /* in: page */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(page && mtr);
+
+ mlog_write_ulint(page + PAGE_HEADER + PAGE_LAST_INSERT, 0,
+ MLOG_2BYTES, mtr);
+}
+
+/****************************************************************
+Gets the first record on the page. */
+UNIV_INLINE
+rec_t*
+page_get_infimum_rec(
+/*=================*/
+ /* out: the first record in record list */
+ page_t* page) /* in: page which must have record(s) */
+{
+ ut_ad(page);
+
+ return(page + PAGE_INFIMUM);
+}
+
+/****************************************************************
+Gets the last record on the page. */
+UNIV_INLINE
+rec_t*
+page_get_supremum_rec(
+/*==================*/
+ /* out: the last record in record list */
+ page_t* page) /* in: page which must have record(s) */
+{
+ ut_ad(page);
+
+ return(page + PAGE_SUPREMUM);
+}
+
+/****************************************************************
+TRUE if the record is a user record on the page. */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec(
+/*=================*/
+ /* out: TRUE if a user record */
+ rec_t* rec) /* in: record */
+{
+ ut_ad(rec);
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ return(FALSE);
+ }
+
+ if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/****************************************************************
+TRUE if the record is the supremum record on a page. */
+UNIV_INLINE
+ibool
+page_rec_is_supremum(
+/*=================*/
+ /* out: TRUE if the supremum record */
+ rec_t* rec) /* in: record */
+{
+ ut_ad(rec);
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************
+TRUE if the record is the infimum record on a page. */
+UNIV_INLINE
+ibool
+page_rec_is_infimum(
+/*================*/
+ /* out: TRUE if the infimum record */
+ rec_t* rec) /* in: record */
+{
+ ut_ad(rec);
+
+ if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************
+TRUE if the record is the first user record on the page. */
+UNIV_INLINE
+ibool
+page_rec_is_first_user_rec(
+/*=======================*/
+ /* out: TRUE if first user record */
+ rec_t* rec) /* in: record */
+{
+ ut_ad(rec);
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ return(FALSE);
+ }
+
+ if (rec == page_rec_get_next(
+ page_get_infimum_rec(buf_frame_align(rec)))) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************
+TRUE if the record is the last user record on the page. */
+UNIV_INLINE
+ibool
+page_rec_is_last_user_rec(
+/*======================*/
+ /* out: TRUE if last user record */
+ rec_t* rec) /* in: record */
+{
+ ut_ad(rec);
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ return(FALSE);
+ }
+
+ if (page_rec_get_next(rec)
+ == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*****************************************************************
+Compares a data tuple to a physical record. Differs from the function
+cmp_dtuple_rec_with_match in the way that the record must reside on an
+index page, and also page infimum and supremum records can be given in
+the parameter rec. These are considered as the negative infinity and
+the positive infinity in the alphabetical order. */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+ /* out: 1, 0, -1, if dtuple is greater, equal,
+ less than rec, respectively, when only the
+ common first fields are compared */
+ dtuple_t* dtuple, /* in: data tuple */
+ rec_t* rec, /* in: physical record on a page; may also
+ be page infimum or supremum, in which case
+ matched-parameter values below are not
+ affected */
+ ulint* matched_fields, /* in/out: number of already completely
+ matched fields; when function returns
+ contains the value for current comparison */
+ ulint* matched_bytes) /* in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns contains the
+ value for current comparison */
+{
+ page_t* page;
+
+ ut_ad(dtuple_check_typed(dtuple));
+
+ page = buf_frame_align(rec);
+
+ if (rec == page_get_infimum_rec(page)) {
+ return(1);
+ } else if (rec == page_get_supremum_rec(page)) {
+ return(-1);
+ } else {
+ return(cmp_dtuple_rec_with_match(dtuple, rec,
+ matched_fields,
+ matched_bytes));
+ }
+}
+
+/*****************************************************************
+Gets the number of user records on page (infimum and supremum records
+are not user records). */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+ /* out: number of user records */
+ page_t* page) /* in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_RECS));
+}
+
+/*****************************************************************
+Gets the number of dir slots in directory. */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+ /* out: number of slots */
+ page_t* page) /* in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_DIR_SLOTS));
+}
+
+/*****************************************************************
+Gets pointer to nth directory slot. */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+ /* out: pointer to dir slot */
+ page_t* page, /* in: index page */
+ ulint n) /* in: position */
+{
+ ut_ad(page_header_get_field(page, PAGE_N_DIR_SLOTS) > n);
+
+ return(page + UNIV_PAGE_SIZE - PAGE_DIR
+ - (n + 1) * PAGE_DIR_SLOT_SIZE);
+}
+
+/******************************************************************
+Used to check the consistency of a record on a page. */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ /* out: TRUE if succeed */
+ rec_t* rec) /* in: record */
+{
+ page_t* page;
+
+ ut_a(rec);
+
+ page = buf_frame_align(rec);
+
+ ut_a(rec <= page_header_get_ptr(page, PAGE_HEAP_TOP));
+ ut_a(rec >= page + PAGE_DATA);
+
+ return(TRUE);
+}
+
+/******************************************************************
+Used to check the consistency of a directory slot. */
+UNIV_INLINE
+ibool
+page_dir_slot_check(
+/*================*/
+ /* out: TRUE if succeed */
+ page_dir_slot_t* slot) /* in: slot */
+{
+ page_t* page;
+ ulint n_slots;
+ ulint n_owned;
+
+ ut_a(slot);
+
+ page = buf_frame_align(slot);
+
+ n_slots = page_header_get_field(page, PAGE_N_DIR_SLOTS);
+
+ ut_a(slot <= page_dir_get_nth_slot(page, 0));
+ ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1));
+
+ ut_a(page_rec_check(page + mach_read_from_2(slot)));
+
+ n_owned = rec_get_n_owned(page + mach_read_from_2(slot));
+
+ if (slot == page_dir_get_nth_slot(page, 0)) {
+ ut_a(n_owned == 1);
+ } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) {
+ ut_a(n_owned >= 1);
+ ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ } else {
+ ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED);
+ ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************
+Gets the record pointed to by a directory slot. */
+UNIV_INLINE
+rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+ /* out: pointer to record */
+ page_dir_slot_t* slot) /* in: directory slot */
+{
+ return(buf_frame_align(slot) + mach_read_from_2(slot));
+}
+
+/*******************************************************************
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+ page_dir_slot_t* slot, /* in: directory slot */
+ rec_t* rec) /* in: record on the page */
+{
+ ut_ad(page_rec_check(rec));
+
+ mach_write_to_2(slot, rec - buf_frame_align(rec));
+}
+
+/*******************************************************************
+Gets the number of records owned by a directory slot. */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ /* out: number of records */
+ page_dir_slot_t* slot) /* in: page directory slot */
+{
+ return(rec_get_n_owned(page_dir_slot_get_rec(slot)));
+}
+
+/*******************************************************************
+This is used to set the owned records field of a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+ page_dir_slot_t* slot, /* in: directory slot */
+ ulint n) /* in: number of records owned
+ by the slot */
+{
+ rec_set_n_owned(page_dir_slot_get_rec(slot), n);
+}
+
+/****************************************************************
+Calculates the space reserved for directory slots of a given number of
+records. The exact value is the fraction n * PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED, rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs) /* in: number of records */
+{
+ return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1)
+ / PAGE_DIR_SLOT_MIN_N_OWNED);
+}
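
A concrete illustration of the rounding in page_dir_calc_reserved_space (an
editorial sketch, not part of the original patch): assuming the values 2 for
PAGE_DIR_SLOT_SIZE and 4 for PAGE_DIR_SLOT_MIN_N_OWNED, reserving space for
9 records needs exactly 9 * 2 / 4 = 4.5 bytes, which the formula rounds up
to 5.

#include <assert.h>

#define SLOT_SIZE	2	/* assumed value of PAGE_DIR_SLOT_SIZE */
#define SLOT_MIN_OWNED	4	/* assumed value of PAGE_DIR_SLOT_MIN_N_OWNED */

int
main(void)
{
	unsigned long	n_recs = 9;

	/* (2 * 9 + 4 - 1) / 4 = 21 / 4 = 5 in integer arithmetic */
	assert((SLOT_SIZE * n_recs + SLOT_MIN_OWNED - 1) / SLOT_MIN_OWNED == 5);

	return(0);
}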
+
+/****************************************************************
+Gets the pointer to the next record on the page. */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ /* out: pointer to next record */
+ rec_t* rec) /* in: pointer to record */
+{
+ ulint offs;
+ page_t* page;
+
+ ut_ad(page_rec_check(rec));
+
+ page = buf_frame_align(rec);
+
+ offs = rec_get_next_offs(rec);
+
+ if (offs == 0) {
+
+ return(NULL);
+ }
+
+ return(page + offs);
+}
+
+/*******************************************************************
+Looks for the directory slot which owns the given record. */
+UNIV_INLINE
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ /* out: the directory slot number */
+ rec_t* rec) /* in: the physical record */
+{
+ ulint i;
+ page_t* page;
+ page_dir_slot_t* slot;
+
+ ut_ad(page_rec_check(rec));
+
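+	/* Only the last record in a directory slot's group of records has a
+	nonzero n_owned field; first advance along the list to that record */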
+ while (rec_get_n_owned(rec) == 0) {
+ rec = page_rec_get_next(rec);
+ }
+
+ page = buf_frame_align(rec);
+
+ i = page_dir_get_n_slots(page) - 1;
+ slot = page_dir_get_nth_slot(page, i);
+
+ while (page_dir_slot_get_rec(slot) != rec) {
+ i--;
+ slot = page_dir_get_nth_slot(page, i);
+ }
+
+ return(i);
+}
+
+/****************************************************************
+Sets the pointer to the next record on the page. */
+UNIV_INLINE
+void
+page_rec_set_next(
+/*==============*/
+ rec_t* rec, /* in: pointer to record, must not be page supremum */
+ rec_t* next) /* in: pointer to next record, must not be page
+ infimum */
+{
+ page_t* page;
+
+ ut_ad(page_rec_check(rec));
+ ut_ad((next == NULL)
+ || (buf_frame_align(rec) == buf_frame_align(next)));
+
+ page = buf_frame_align(rec);
+
+ ut_ad(rec != page_get_supremum_rec(page));
+ ut_ad(next != page_get_infimum_rec(page));
+
+ if (next == NULL) {
+ rec_set_next_offs(rec, 0);
+ } else {
+ rec_set_next_offs(rec, (ulint)(next - page));
+ }
+}
+
+/****************************************************************
+Gets the pointer to the previous record. */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+ /* out: pointer to previous record */
+ rec_t* rec) /* in: pointer to record, must not be page
+ infimum */
+{
+ page_dir_slot_t* slot;
+ ulint slot_no;
+ rec_t* rec2;
+ rec_t* prev_rec = NULL;
+ page_t* page;
+
+ ut_ad(page_rec_check(rec));
+
+ page = buf_frame_align(rec);
+
+ ut_ad(rec != page_get_infimum_rec(page));
+
+ slot_no = page_dir_find_owner_slot(rec);
+
+ ut_ad(slot_no != 0);
+
+ slot = page_dir_get_nth_slot(page, slot_no - 1);
+
+ rec2 = page_dir_slot_get_rec(slot);
+
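+	/* rec2 is the record owned by the previous directory slot; every
+	record from rec2 onward precedes rec in the singly linked record
+	list, so walk forward until the record just before rec is found */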
+ while (rec != rec2) {
+ prev_rec = rec2;
+ rec2 = page_rec_get_next(rec2);
+ }
+
+ ut_ad(prev_rec);
+
+ return(prev_rec);
+}
+
+/*******************************************************************
+Looks for the record which owns the given record. */
+UNIV_INLINE
+rec_t*
+page_rec_find_owner_rec(
+/*====================*/
+ /* out: the owner record */
+ rec_t* rec) /* in: the physical record */
+{
+ ut_ad(page_rec_check(rec));
+
+ while (rec_get_n_owned(rec) == 0) {
+ rec = page_rec_get_next(rec);
+ }
+
+ return(rec);
+}
+
+/****************************************************************
+Returns the sum of the sizes of the records in the record list, excluding
+the infimum and supremum records. */
+UNIV_INLINE
+ulint
+page_get_data_size(
+/*===============*/
+ /* out: data in bytes */
+ page_t* page) /* in: index page */
+{
+ ulint ret;
+
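+	/* The record data fills the heap from the end of the fixed infimum
+	and supremum records up to the heap top, minus the bytes in deleted
+	(garbage) records */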
+ ret = (ulint)(page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_SUPREMUM_END
+ - page_header_get_field(page, PAGE_GARBAGE));
+
+ ut_ad(ret < UNIV_PAGE_SIZE);
+
+ return(ret);
+}
+
+/*****************************************************************
+Calculates free space if a page is emptied. */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(void)
+/*==============================*/
+ /* out: free space */
+{
+ return((ulint)(UNIV_PAGE_SIZE
+ - PAGE_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+}
+
+/****************************************************************
+Each user record on a page, and also each deleted user record in the heap,
+takes its own size plus a fraction, dir slot size / PAGE_DIR_SLOT_MIN_N_OWNED,
+bytes of directory space. If the sum of these exceeds the value of
+page_get_free_space_of_empty, the insert is impossible; otherwise it is
+allowed. This function returns the maximum combined size of records
+which can be inserted on top of the record heap. */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ /* out: maximum combined size for inserted records */
+ page_t* page, /* in: index page */
+ ulint n_recs) /* in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + (page_header_get_field(page, PAGE_N_HEAP) - 2));
+
+ free_space = page_get_free_space_of_empty();
+
+ /* Above the 'n_recs +' part reserves directory space for the new
+ inserted records; the '- 2' excludes page infimum and supremum
+ records */
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
+
+/****************************************************************
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if a page is first reorganized. */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ /* out: maximum combined size for inserted records */
+ page_t* page, /* in: index page */
+ ulint n_recs) /* in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ occupied = page_get_data_size(page)
+ + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page));
+
+ free_space = page_get_free_space_of_empty();
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
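
The following is a hypothetical helper (an editorial sketch, not part of the
original patch; the function name and the decision logic are illustrative
assumptions) showing how a caller might combine the two functions above to
decide whether an insert of rec_size bytes fits on the page directly, fits
only after the page is reorganized, or requires a page split.

/* Sketch: classify the outcome of a prospective insert of rec_size bytes */
static ibool
page_insert_would_fit(
/*==================*/
				/* out: TRUE if the record fits, possibly
				after reorganization */
	page_t*	page,		/* in: index page */
	ulint	rec_size,	/* in: size of the record to insert */
	ibool*	reorganize)	/* out: TRUE if the page must be reorganized
				before the insert */
{
	*reorganize = FALSE;

	if (page_get_max_insert_size(page, 1) >= rec_size) {

		return(TRUE);
	}

	if (page_get_max_insert_size_after_reorganize(page, 1) >= rec_size) {

		*reorganize = TRUE;

		return(TRUE);
	}

	/* does not fit even after reorganization: a page split is needed */

	return(FALSE);
}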
+
+/****************************************************************
+Puts a record to the free list. */
+UNIV_INLINE
+void
+page_mem_free(
+/*==========*/
+ page_t* page, /* in: index page */
+ rec_t* rec) /* in: pointer to the (origin of) record */
+{
+ rec_t* free;
+ ulint garbage;
+
+ free = page_header_get_ptr(page, PAGE_FREE);
+
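+	/* Link the freed record to the head of the free record list of the
+	page; its size is added to the garbage byte count below */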
+ page_rec_set_next(rec, free);
+ page_header_set_ptr(page, PAGE_FREE, rec);
+
+ garbage = page_header_get_field(page, PAGE_GARBAGE);
+
+ page_header_set_field(page, PAGE_GARBAGE,
+ garbage + rec_get_size(rec));
+}
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
diff --git a/innobase/include/page0types.h b/innobase/include/page0types.h
new file mode 100644
index 00000000000..f149aad5b98
--- /dev/null
+++ b/innobase/include/page0types.h
@@ -0,0 +1,20 @@
+/******************************************************
+Index page routines
+
+(c) 1994-1996 Innobase Oy
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0types_h
+#define page0types_h
+
+#include "univ.i"
+
+/* Type of the index page */
+typedef byte page_t;
+typedef struct page_search_struct page_search_t;
+typedef struct page_cur_struct page_cur_t;
+
+
+#endif
diff --git a/innobase/include/pars0grm.h b/innobase/include/pars0grm.h
new file mode 100644
index 00000000000..d0b4b4c2e42
--- /dev/null
+++ b/innobase/include/pars0grm.h
@@ -0,0 +1,90 @@
+#ifndef YYSTYPE
+#define YYSTYPE int
+#endif
+#define PARS_INT_LIT 258
+#define PARS_FLOAT_LIT 259
+#define PARS_STR_LIT 260
+#define PARS_NULL_LIT 261
+#define PARS_ID_TOKEN 262
+#define PARS_AND_TOKEN 263
+#define PARS_OR_TOKEN 264
+#define PARS_NOT_TOKEN 265
+#define PARS_GE_TOKEN 266
+#define PARS_LE_TOKEN 267
+#define PARS_NE_TOKEN 268
+#define PARS_PROCEDURE_TOKEN 269
+#define PARS_IN_TOKEN 270
+#define PARS_OUT_TOKEN 271
+#define PARS_INT_TOKEN 272
+#define PARS_INTEGER_TOKEN 273
+#define PARS_FLOAT_TOKEN 274
+#define PARS_CHAR_TOKEN 275
+#define PARS_IS_TOKEN 276
+#define PARS_BEGIN_TOKEN 277
+#define PARS_END_TOKEN 278
+#define PARS_IF_TOKEN 279
+#define PARS_THEN_TOKEN 280
+#define PARS_ELSE_TOKEN 281
+#define PARS_ELSIF_TOKEN 282
+#define PARS_LOOP_TOKEN 283
+#define PARS_WHILE_TOKEN 284
+#define PARS_RETURN_TOKEN 285
+#define PARS_SELECT_TOKEN 286
+#define PARS_SUM_TOKEN 287
+#define PARS_COUNT_TOKEN 288
+#define PARS_DISTINCT_TOKEN 289
+#define PARS_FROM_TOKEN 290
+#define PARS_WHERE_TOKEN 291
+#define PARS_FOR_TOKEN 292
+#define PARS_DDOT_TOKEN 293
+#define PARS_CONSISTENT_TOKEN 294
+#define PARS_READ_TOKEN 295
+#define PARS_ORDER_TOKEN 296
+#define PARS_BY_TOKEN 297
+#define PARS_ASC_TOKEN 298
+#define PARS_DESC_TOKEN 299
+#define PARS_INSERT_TOKEN 300
+#define PARS_INTO_TOKEN 301
+#define PARS_VALUES_TOKEN 302
+#define PARS_UPDATE_TOKEN 303
+#define PARS_SET_TOKEN 304
+#define PARS_DELETE_TOKEN 305
+#define PARS_CURRENT_TOKEN 306
+#define PARS_OF_TOKEN 307
+#define PARS_CREATE_TOKEN 308
+#define PARS_TABLE_TOKEN 309
+#define PARS_INDEX_TOKEN 310
+#define PARS_UNIQUE_TOKEN 311
+#define PARS_CLUSTERED_TOKEN 312
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 313
+#define PARS_ON_TOKEN 314
+#define PARS_ASSIGN_TOKEN 315
+#define PARS_DECLARE_TOKEN 316
+#define PARS_CURSOR_TOKEN 317
+#define PARS_SQL_TOKEN 318
+#define PARS_OPEN_TOKEN 319
+#define PARS_FETCH_TOKEN 320
+#define PARS_CLOSE_TOKEN 321
+#define PARS_NOTFOUND_TOKEN 322
+#define PARS_TO_CHAR_TOKEN 323
+#define PARS_TO_NUMBER_TOKEN 324
+#define PARS_TO_BINARY_TOKEN 325
+#define PARS_BINARY_TO_NUMBER_TOKEN 326
+#define PARS_SUBSTR_TOKEN 327
+#define PARS_REPLSTR_TOKEN 328
+#define PARS_CONCAT_TOKEN 329
+#define PARS_INSTR_TOKEN 330
+#define PARS_LENGTH_TOKEN 331
+#define PARS_SYSDATE_TOKEN 332
+#define PARS_PRINTF_TOKEN 333
+#define PARS_ASSERT_TOKEN 334
+#define PARS_RND_TOKEN 335
+#define PARS_RND_STR_TOKEN 336
+#define PARS_ROW_PRINTF_TOKEN 337
+#define PARS_COMMIT_TOKEN 338
+#define PARS_ROLLBACK_TOKEN 339
+#define PARS_WORK_TOKEN 340
+#define NEG 341
+
+
+extern YYSTYPE yylval;
diff --git a/innobase/include/pars0opt.h b/innobase/include/pars0opt.h
new file mode 100644
index 00000000000..d091c3ee2d0
--- /dev/null
+++ b/innobase/include/pars0opt.h
@@ -0,0 +1,58 @@
+/******************************************************
+Simple SQL optimizer
+
+(c) 1997 Innobase Oy
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0sym.h"
+#include "dict0types.h"
+#include "row0sel.h"
+
+/***********************************************************************
+Optimizes a select. Decides which indexes of the tables to use. The tables
+are accessed in the order in which they were written in the FROM part of the
+select statement. */
+
+void
+opt_search_plan(
+/*============*/
+ sel_node_t* sel_node); /* in: parsed select node */
+/***********************************************************************
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /* in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /* in: index to use */
+ sym_node_list_t* col_list, /* in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /* in: plan or NULL */
+ que_node_t* exp); /* in: expression or condition */
+/************************************************************************
+Prints info of a query plan. */
+
+void
+opt_print_query_plan(
+/*=================*/
+ sel_node_t* sel_node); /* in: select node */
+
+#ifndef UNIV_NONINL
+#include "pars0opt.ic"
+#endif
+
+#endif
diff --git a/innobase/include/pars0opt.ic b/innobase/include/pars0opt.ic
new file mode 100644
index 00000000000..0bfa8526bee
--- /dev/null
+++ b/innobase/include/pars0opt.ic
@@ -0,0 +1,7 @@
+/******************************************************
+Simple SQL optimizer
+
+(c) 1997 Innobase Oy
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/pars0pars.h b/innobase/include/pars0pars.h
new file mode 100644
index 00000000000..e08b071e246
--- /dev/null
+++ b/innobase/include/pars0pars.h
@@ -0,0 +1,566 @@
+/******************************************************
+SQL parser
+
+(c) 1996 Innobase Oy
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0pars_h
+#define pars0pars_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+
+extern int yydebug;
+
+/* If the following is set TRUE, the lexer will print the SQL string
+as it tokenizes it */
+
+extern ibool pars_print_lexed;
+
+/* Global variable used while parsing a single procedure or query: the code is
+NOT re-entrant */
+extern sym_tab_t* pars_sym_tab_global;
+
+extern pars_res_word_t pars_to_char_token;
+extern pars_res_word_t pars_to_number_token;
+extern pars_res_word_t pars_to_binary_token;
+extern pars_res_word_t pars_binary_to_number_token;
+extern pars_res_word_t pars_substr_token;
+extern pars_res_word_t pars_replstr_token;
+extern pars_res_word_t pars_concat_token;
+extern pars_res_word_t pars_length_token;
+extern pars_res_word_t pars_instr_token;
+extern pars_res_word_t pars_sysdate_token;
+extern pars_res_word_t pars_printf_token;
+extern pars_res_word_t pars_assert_token;
+extern pars_res_word_t pars_rnd_token;
+extern pars_res_word_t pars_rnd_str_token;
+extern pars_res_word_t pars_count_token;
+extern pars_res_word_t pars_sum_token;
+extern pars_res_word_t pars_distinct_token;
+extern pars_res_word_t pars_int_token;
+extern pars_res_word_t pars_char_token;
+extern pars_res_word_t pars_float_token;
+extern pars_res_word_t pars_update_token;
+extern pars_res_word_t pars_asc_token;
+extern pars_res_word_t pars_desc_token;
+extern pars_res_word_t pars_open_token;
+extern pars_res_word_t pars_close_token;
+extern pars_res_word_t pars_consistent_token;
+extern pars_res_word_t pars_unique_token;
+extern pars_res_word_t pars_clustered_token;
+
+extern ulint pars_star_denoter;
+
+/* Procedure parameter types */
+#define PARS_INPUT 0
+#define PARS_OUTPUT 1
+#define PARS_NOT_PARAM 2
+
+int
+yyparse(void);
+
+/*****************************************************************
+Parses an SQL string returning the query graph. */
+
+que_t*
+pars_sql(
+/*=====*/
+ /* out, own: the query graph */
+ char* str); /* in: SQL string */
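
A minimal usage sketch of this entry point (editorial, not part of the
original patch; the procedure text is a hypothetical example of the internal
SQL dialect accepted by this parser):

/* Sketch: parse a trivial internal SQL procedure into a query graph */
static que_t*
pars_example_graph(void)
{
	return(pars_sql("PROCEDURE P () IS\nBEGIN\nCOMMIT WORK;\nEND;"));
}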
+/*****************************************************************
+Retrieves characters for the lexical analyzer. */
+
+void
+pars_get_lex_chars(
+/*===============*/
+ char* buf, /* in/out: buffer where to copy */
+ int* result, /* out: number of characters copied or EOF */
+ int max_size); /* in: maximum number of characters which fit
+ in the buffer */
+/*****************************************************************
+Instructs the lexical analyzer to stop when it receives the EOF integer. */
+
+int
+yywrap(void);
+/*========*/
+ /* out: returns TRUE */
+/*****************************************************************
+Called by yyparse on error. */
+
+void
+yyerror(
+/*====*/
+ char* s); /* in: error message string */
+/*************************************************************************
+Parses a variable declaration. */
+
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+ /* out, own: symbol table node of type
+ SYM_VAR */
+ sym_node_t* node, /* in: symbol table node allocated for the
+ id of the variable */
+ pars_res_word_t* type); /* in: pointer to a type token */
+/*************************************************************************
+Parses a function expression. */
+
+func_node_t*
+pars_func(
+/*======*/
+ /* out, own: function node in a query tree */
+ que_node_t* res_word,/* in: function name reserved word */
+ que_node_t* arg); /* in: first argument in the argument list */
+/*************************************************************************
+Parses an operator expression. */
+
+func_node_t*
+pars_op(
+/*====*/
+ /* out, own: function node in a query tree */
+ int func, /* in: operator token code */
+ que_node_t* arg1, /* in: first argument */
+ que_node_t* arg2); /* in: second argument or NULL for a unary
+ operator */
+/*************************************************************************
+Parses an ORDER BY clause. Order by a single column only is supported. */
+
+order_node_t*
+pars_order_by(
+/*==========*/
+ /* out, own: order-by node in a query tree */
+ sym_node_t* column, /* in: column name */
+ pars_res_word_t* asc); /* in: &pars_asc_token or &pars_desc_token */
+/*************************************************************************
+Parses a select list; creates a query graph node for the whole SELECT
+statement. */
+
+sel_node_t*
+pars_select_list(
+/*=============*/
+ /* out, own: select node in a query
+ tree */
+ que_node_t* select_list, /* in: select list */
+ sym_node_t* into_list); /* in: variables list or NULL */
+/*************************************************************************
+Parses a cursor declaration. */
+
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+ /* out: sym_node */
+ sym_node_t* sym_node, /* in: cursor id node in the symbol
+ table */
+ sel_node_t* select_node); /* in: select node */
+/*************************************************************************
+Parses a select statement. */
+
+sel_node_t*
+pars_select_statement(
+/*==================*/
+ /* out, own: select node in a query
+ tree */
+ sel_node_t* select_node, /* in: select node already containing
+ the select list */
+ sym_node_t* table_list, /* in: table list */
+ que_node_t* search_cond, /* in: search condition or NULL */
+ pars_res_word_t* for_update, /* in: NULL or &pars_update_token */
+ pars_res_word_t* consistent_read,/* in: NULL or
+ &pars_consistent_token */
+ order_node_t* order_by); /* in: NULL or an order-by node */
+/*************************************************************************
+Parses a column assignment in an update. */
+
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+ /* out: column assignment node */
+ sym_node_t* column, /* in: column to assign */
+ que_node_t* exp); /* in: value to assign */
+/*************************************************************************
+Parses a delete or update statement start. */
+
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+ /* out, own: update node in a query
+ tree */
+ ibool is_delete, /* in: TRUE if delete */
+ sym_node_t* table_sym, /* in: table name node */
+ col_assign_node_t* col_assign_list);/* in: column assignment list, NULL
+ if delete */
+/*************************************************************************
+Parses an update or delete statement. */
+
+upd_node_t*
+pars_update_statement(
+/*==================*/
+ /* out, own: update node in a query
+ tree */
+ upd_node_t* node, /* in: update node */
+ sym_node_t* cursor_sym, /* in: pointer to a cursor entry in
+ the symbol table or NULL */
+ que_node_t* search_cond); /* in: search condition or NULL */
+/*************************************************************************
+Parses an insert statement. */
+
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+ /* out, own: insert node in a query
+ tree */
+ sym_node_t* table_sym, /* in: table name node */
+ que_node_t* values_list, /* in: value expression list or NULL */
+ sel_node_t* select); /* in: select condition or NULL */
+/*************************************************************************
+Parses a procedure parameter declaration. */
+
+sym_node_t*
+pars_parameter_declaration(
+/*=======================*/
+ /* out, own: symbol table node of type
+ SYM_VAR */
+ sym_node_t* node, /* in: symbol table node allocated for the
+ id of the parameter */
+ ulint param_type,
+ /* in: PARS_INPUT or PARS_OUTPUT */
+ pars_res_word_t* type); /* in: pointer to a type token */
+/*************************************************************************
+Parses an elsif element. */
+
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+ /* out: elsif node */
+ que_node_t* cond, /* in: if-condition */
+ que_node_t* stat_list); /* in: statement list */
+/*************************************************************************
+Parses an if-statement. */
+
+if_node_t*
+pars_if_statement(
+/*==============*/
+ /* out: if-statement node */
+ que_node_t* cond, /* in: if-condition */
+ que_node_t* stat_list, /* in: statement list */
+ que_node_t* else_part); /* in: else-part statement list */
+/*************************************************************************
+Parses a for-loop-statement. */
+
+for_node_t*
+pars_for_statement(
+/*===============*/
+ /* out: for-statement node */
+ sym_node_t* loop_var, /* in: loop variable */
+ que_node_t* loop_start_limit,/* in: loop start expression */
+ que_node_t* loop_end_limit, /* in: loop end expression */
+ que_node_t* stat_list); /* in: statement list */
+/*************************************************************************
+Parses a while-statement. */
+
+while_node_t*
+pars_while_statement(
+/*=================*/
+ /* out: while-statement node */
+ que_node_t* cond, /* in: while-condition */
+ que_node_t* stat_list); /* in: statement list */
+/*************************************************************************
+Parses a return-statement. */
+
+return_node_t*
+pars_return_statement(void);
+/*=======================*/
+ /* out: return-statement node */
+/*************************************************************************
+Parses a procedure call. */
+
+func_node_t*
+pars_procedure_call(
+/*================*/
+ /* out: function node */
+ que_node_t* res_word,/* in: procedure name reserved word */
+ que_node_t* args); /* in: argument list */
+/*************************************************************************
+Parses an assignment statement. */
+
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+ /* out: assignment statement node */
+ sym_node_t* var, /* in: variable to assign */
+ que_node_t* val); /* in: value to assign */
+/*************************************************************************
+Parses a fetch statement. */
+
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+ /* out: fetch statement node */
+ sym_node_t* cursor, /* in: cursor node */
+ sym_node_t* into_list); /* in: variables to set */
+/*************************************************************************
+Parses an open or close cursor statement. */
+
+open_node_t*
+pars_open_statement(
+/*================*/
+ /* out: open or close statement node */
+ ulint type, /* in: ROW_SEL_OPEN_CURSOR
+ or ROW_SEL_CLOSE_CURSOR */
+ sym_node_t* cursor); /* in: cursor node */
+/*************************************************************************
+Parses a row_printf-statement. */
+
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+ /* out: row_printf-statement node */
+ sel_node_t* sel_node); /* in: select node */
+/*************************************************************************
+Parses a commit statement. */
+
+commit_node_t*
+pars_commit_statement(void);
+/*=======================*/
+/*************************************************************************
+Parses a rollback statement. */
+
+roll_node_t*
+pars_rollback_statement(void);
+/*=========================*/
+/*************************************************************************
+Parses a column definition at a table creation. */
+
+sym_node_t*
+pars_column_def(
+/*============*/
+ /* out: column sym table node */
+ sym_node_t* sym_node, /* in: column node in the symbol
+ table */
+ pars_res_word_t* type); /* in: data type */
+/*************************************************************************
+Parses a table creation operation. */
+
+tab_node_t*
+pars_create_table(
+/*==============*/
+ /* out: table create subgraph */
+ sym_node_t* table_sym, /* in: table name node in the symbol
+ table */
+ sym_node_t* column_defs, /* in: list of column names */
+ void* not_fit_in_memory);/* in: a non-NULL pointer means that
+ this is a table which in simulations
+ should be simulated as not fitting
+ in memory; the thread is put to sleep
+ to simulate disk accesses; NOTE that
+ this flag is not stored to the data
+ dictionary on disk, and the database
+ will forget about a non-NULL value if
+ it has to reload the table definition
+ from disk */
+/*************************************************************************
+Parses an index creation operation. */
+
+ind_node_t*
+pars_create_index(
+/*==============*/
+ /* out: index create subgraph */
+ pars_res_word_t* unique_def, /* in: not NULL if a unique index */
+ pars_res_word_t* clustered_def, /* in: not NULL if a clustered index */
+ sym_node_t* index_sym, /* in: index name node in the symbol
+ table */
+ sym_node_t* table_sym, /* in: table name node in the symbol
+ table */
+ sym_node_t* column_list); /* in: list of column names */
+/*************************************************************************
+Parses a procedure definition. */
+
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+ /* out: query fork node */
+ sym_node_t* sym_node, /* in: procedure id node in the symbol
+ table */
+ sym_node_t* param_list, /* in: parameter declaration list */
+ que_node_t* stat_list); /* in: statement list */
+/*****************************************************************
+Reads stored procedure input parameter values from a buffer. */
+
+void
+pars_proc_read_input_params_from_buf(
+/*=================================*/
+ que_t* graph, /* in: query graph which contains a stored procedure */
+ byte* buf); /* in: buffer */
+/*****************************************************************
+Writes stored procedure output parameter values to a buffer. */
+
+ulint
+pars_proc_write_output_params_to_buf(
+/*=================================*/
+ byte* buf, /* in: buffer which must be big enough */
+ que_t* graph); /* in: query graph which contains a stored procedure */
+/*****************************************************************
+Parses a stored procedure call when it is not within another stored
+procedure, that is, when the client issues the procedure call directly. */
+
+que_fork_t*
+pars_stored_procedure_call(
+/*=======================*/
+ /* out: query graph */
+ sym_node_t* sym_node); /* in: stored procedure name */
+/*****************************************************************
+Writes info about query parameter markers (denoted with '?' in ODBC) into a
+buffer. */
+
+ulint
+pars_write_query_param_info(
+/*========================*/
+ /* out: number of bytes used for info in buf */
+ byte* buf, /* in: buffer which must be big enough */
+ que_fork_t* graph); /* in: parsed query graph */
+/**********************************************************************
+Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running. The fork created is of
+type QUE_FORK_MYSQL_INTERFACE. */
+
+que_thr_t*
+pars_complete_graph_for_exec(
+/*=========================*/
+ /* out: query thread node to run */
+ que_node_t* node, /* in: root node for an incomplete
+ query graph */
+ trx_t* trx, /* in: transaction handle */
+ mem_heap_t* heap); /* in: memory heap from which allocated */
+
+
+/* Struct used to denote a reserved word in a parsing tree */
+struct pars_res_word_struct{
+ ulint code; /* the token code for the reserved word from
+ pars0grm.h */
+};
+
+/* A predefined function or operator node in a parsing tree; this construct
+is also used for some non-functions like the assignment ':=' */
+struct func_node_struct{
+ que_common_t common; /* type: QUE_NODE_FUNC */
+ int func; /* token code of the function name */
+ ulint class; /* class of the function */
+ que_node_t* args; /* argument(s) of the function */
+ UT_LIST_NODE_T(func_node_t) cond_list;
+ /* list of comparison conditions; defined
+ only for comparison operator nodes except,
+ presently, for OPT_SCROLL_TYPE ones */
+ UT_LIST_NODE_T(func_node_t) func_node_list;
+ /* list of function nodes in a parsed
+ query graph */
+};
+
+/* An order-by node in a select */
+struct order_node_struct{
+ que_common_t common; /* type: QUE_NODE_ORDER */
+ sym_node_t* column; /* order-by column */
+ ibool asc; /* TRUE if ascending, FALSE if descending */
+};
+
+/* Procedure definition node */
+struct proc_node_struct{
+ que_common_t common; /* type: QUE_NODE_PROC */
+ sym_node_t* proc_id; /* procedure name symbol in the symbol
+ table of this same procedure */
+ sym_node_t* param_list; /* input and output parameters */
+ que_node_t* stat_list; /* statement list */
+ sym_tab_t* sym_tab; /* symbol table of this procedure */
+ dict_proc_t* dict_proc; /* stored procedure node in the
+ dictionary cache, if defined */
+};
+
+/* Stored procedure call node */
+struct call_node_struct{
+ que_common_t common; /* type: QUE_NODE_CALL */
+ sym_node_t* proc_name; /* stored procedure name */
+ dict_proc_t* procedure_def; /* pointer to a stored procedure graph
+ in the dictionary stored procedure
+ cache */
+ sym_tab_t* sym_tab; /* symbol table of this query */
+};
+
+/* elsif-element node */
+struct elsif_node_struct{
+ que_common_t common; /* type: QUE_NODE_ELSIF */
+ que_node_t* cond; /* if condition */
+ que_node_t* stat_list; /* statement list */
+};
+
+/* if-statement node */
+struct if_node_struct{
+ que_common_t common; /* type: QUE_NODE_IF */
+ que_node_t* cond; /* if condition */
+ que_node_t* stat_list; /* statement list */
+ que_node_t* else_part; /* else-part statement list */
+ elsif_node_t* elsif_list; /* elsif element list */
+};
+
+/* while-statement node */
+struct while_node_struct{
+ que_common_t common; /* type: QUE_NODE_WHILE */
+ que_node_t* cond; /* while condition */
+ que_node_t* stat_list; /* statement list */
+};
+
+/* for-loop-statement node */
+struct for_node_struct{
+ que_common_t common; /* type: QUE_NODE_FOR */
+ sym_node_t* loop_var; /* loop variable: this is the
+ dereferenced symbol from the
+ variable declarations, not the
+ symbol occurrence in the for loop
+ definition */
+ que_node_t* loop_start_limit;/* initial value of loop variable */
+ que_node_t* loop_end_limit; /* end value of loop variable */
+ int loop_end_value; /* evaluated value for the end value:
+ it is calculated only when the loop
+ is entered, and will not change within
+ the loop */
+ que_node_t* stat_list; /* statement list */
+};
+
+/* return-statement node */
+struct return_node_struct{
+ que_common_t common; /* type: QUE_NODE_RETURN */
+};
+
+/* Assignment statement node */
+struct assign_node_struct{
+ que_common_t common; /* type: QUE_NODE_ASSIGNMENT */
+ sym_node_t* var; /* variable to set */
+ que_node_t* val; /* value to assign */
+};
+
+/* Column assignment node */
+struct col_assign_node_struct{
+ que_common_t common; /* type: QUE_NODE_COL_ASSIGN */
+ sym_node_t* col; /* column to set */
+ que_node_t* val; /* value to assign */
+};
+
+/* Classes of functions */
+#define PARS_FUNC_ARITH 1 /* +, -, *, / */
+#define PARS_FUNC_LOGICAL 2
+#define PARS_FUNC_CMP 3
+#define PARS_FUNC_PREDEFINED 4 /* TO_NUMBER, SUBSTR, ... */
+#define PARS_FUNC_AGGREGATE 5 /* COUNT, DISTINCT, SUM */
+#define PARS_FUNC_OTHER 6 /* these are not real functions,
+ e.g., := */
+
+#ifndef UNIV_NONINL
+#include "pars0pars.ic"
+#endif
+
+#endif
diff --git a/innobase/include/pars0pars.ic b/innobase/include/pars0pars.ic
new file mode 100644
index 00000000000..155b6659ace
--- /dev/null
+++ b/innobase/include/pars0pars.ic
@@ -0,0 +1,7 @@
+/******************************************************
+SQL parser
+
+(c) 1996 Innobase Oy
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/pars0sym.h b/innobase/include/pars0sym.h
new file mode 100644
index 00000000000..9fdeb1984a9
--- /dev/null
+++ b/innobase/include/pars0sym.h
@@ -0,0 +1,191 @@
+/******************************************************
+SQL parser symbol table
+
+(c) 1997 Innobase Oy
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0sym_h
+#define pars0sym_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "dict0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+
+/**********************************************************************
+Creates a symbol table for a single stored procedure or query. */
+
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+ /* out, own: symbol table */
+ mem_heap_t* heap); /* in: memory heap where to create */
+/**********************************************************************
+Frees the memory allocated dynamically AFTER the parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Also frees SQL explicit cursor definitions. */
+
+void
+sym_tab_free_private(
+/*=================*/
+ sym_tab_t* sym_tab); /* in, own: symbol table */
+/**********************************************************************
+Adds an integer literal to a symbol table. */
+
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+ /* out: symbol table node */
+ sym_tab_t* sym_tab, /* in: symbol table */
+ ulint val); /* in: integer value */
+/**********************************************************************
+Adds a string literal to a symbol table. */
+
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+ /* out: symbol table node */
+ sym_tab_t* sym_tab, /* in: symbol table */
+ byte* str, /* in: string with no quotes around
+ it */
+ ulint len); /* in: string length */
+/**********************************************************************
+Adds an SQL null literal to a symbol table. */
+
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+ /* out: symbol table node */
+ sym_tab_t* sym_tab); /* in: symbol table */
+/**********************************************************************
+Adds an identifier to a symbol table. */
+
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+ /* out: symbol table node */
+ sym_tab_t* sym_tab, /* in: symbol table */
+ byte* name, /* in: identifier name */
+ ulint len); /* in: identifier length */
+
+#define SYM_CLUST_FIELD_NO 0
+#define SYM_SEC_FIELD_NO 1
+
+struct sym_node_struct{
+ que_common_t common; /* node type:
+ QUE_NODE_SYMBOL */
+ /* NOTE: if the data field in 'common.val' is not NULL and the symbol
+ table node is not for a temporary column, the memory for the value has
+ been allocated from dynamic memory and it should be freed when the
+ symbol table is discarded */
+
+ sym_node_t* indirection; /* pointer to
+ another symbol table
+ node which contains
+ the value for this
+ node, NULL otherwise */
+ sym_node_t* alias; /* pointer to
+ another symbol table
+ node for which this
+ node is an alias,
+ NULL otherwise */
+ UT_LIST_NODE_T(sym_node_t) col_var_list; /* list of table
+ columns or a list of
+ input variables for an
+ explicit cursor */
+ ibool copy_val; /* TRUE if a column
+ and its value should
+ be copied to dynamic
+ memory when fetched */
+ ulint field_nos[2]; /* if a column, in
+ the position
+ SYM_CLUST_FIELD_NO is
+ the field number in the
+ clustered index; in
+ the position
+ SYM_SEC_FIELD_NO
+ the field number in the
+ non-clustered index to
+ use first; if not found
+ from the index, then
+ ULINT_UNDEFINED */
+ ibool resolved; /* TRUE if the
+ meaning of a variable
+ or a column has been
+ resolved; for literals
+ this is always TRUE */
+ ulint token_type; /* SYM_VAR, SYM_COLUMN,
+ SYM_IMPLICIT_VAR,
+ SYM_LIT, SYM_TABLE,
+ SYM_CURSOR, ... */
+ char* name; /* name of an id */
+ ulint name_len; /* id name length */
+ dict_table_t* table; /* table definition
+ if a table id or a
+ column id */
+ dict_proc_t* procedure_def; /* stored procedure
+ definition, if a
+ stored procedure name */
+ ulint col_no; /* column number if a
+ column */
+ sel_buf_t* prefetch_buf; /* NULL, or a buffer
+ for cached column
+ values for prefetched
+ rows */
+ sel_node_t* cursor_def; /* cursor definition
+ select node if a
+ named cursor */
+ ulint param_type; /* PARS_INPUT,
+ PARS_OUTPUT, or
+ PARS_NOT_PARAM if not a
+ procedure parameter */
+ sym_tab_t* sym_table; /* back pointer to
+ the symbol table */
+ UT_LIST_NODE_T(sym_node_t) sym_list; /* list of symbol
+ nodes */
+};
+
+struct sym_tab_struct{
+ que_t* query_graph;
+ /* query graph generated by the
+ parser */
+ char* sql_string;
+ /* SQL string to parse */
+ int string_len;
+ /* SQL string length */
+ int next_char_pos;
+ /* position of the next character in
+ sql_string to give to the lexical
+ analyzer */
+ sym_node_list_t sym_list;
+ /* list of symbol nodes in the symbol
+ table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ func_node_list;
+ /* list of function nodes in the
+ parsed query graph */
+ mem_heap_t* heap; /* memory heap from which we can
+ allocate space */
+};
+
+/* Types of a symbol table entry */
+#define SYM_VAR 91 /* declared parameter or local
+ variable of a procedure */
+#define SYM_IMPLICIT_VAR 92 /* storage for an intermediate result
+ of a calculation */
+#define SYM_LIT 93 /* literal */
+#define SYM_TABLE 94 /* database table name */
+#define SYM_COLUMN 95 /* database table column */
+#define SYM_CURSOR 96 /* named cursor */
+#define SYM_PROCEDURE_NAME 97 /* stored procedure name */
+#define SYM_INDEX 98 /* database index name */
+
+#ifndef UNIV_NONINL
+#include "pars0sym.ic"
+#endif
+
+#endif
diff --git a/innobase/include/pars0sym.ic b/innobase/include/pars0sym.ic
new file mode 100644
index 00000000000..9508d423769
--- /dev/null
+++ b/innobase/include/pars0sym.ic
@@ -0,0 +1,7 @@
+/******************************************************
+SQL parser symbol table
+
+(c) 1997 Innobase Oy
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/pars0types.h b/innobase/include/pars0types.h
new file mode 100644
index 00000000000..e7471260501
--- /dev/null
+++ b/innobase/include/pars0types.h
@@ -0,0 +1,29 @@
+/******************************************************
+SQL parser global types
+
+(c) 1997 Innobase Oy
+
+Created 1/11/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0types_h
+#define pars0types_h
+
+typedef struct sym_node_struct sym_node_t;
+typedef struct sym_tab_struct sym_tab_t;
+typedef struct pars_res_word_struct pars_res_word_t;
+typedef struct func_node_struct func_node_t;
+typedef struct order_node_struct order_node_t;
+typedef struct proc_node_struct proc_node_t;
+typedef struct call_node_struct call_node_t;
+typedef struct elsif_node_struct elsif_node_t;
+typedef struct if_node_struct if_node_t;
+typedef struct while_node_struct while_node_t;
+typedef struct for_node_struct for_node_t;
+typedef struct return_node_struct return_node_t;
+typedef struct assign_node_struct assign_node_t;
+typedef struct col_assign_node_struct col_assign_node_t;
+
+typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t;
+
+#endif
diff --git a/innobase/include/que0que.h b/innobase/include/que0que.h
new file mode 100644
index 00000000000..bd21a9801aa
--- /dev/null
+++ b/innobase/include/que0que.h
@@ -0,0 +1,495 @@
+/******************************************************
+Query graph
+
+(c) 1996 Innobase Oy
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0que_h
+#define que0que_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "pars0types.h"
+
+/* If the following flag is set TRUE, the module will print trace info
+of SQL execution in the UNIV_SQL_DEBUG version */
+extern ibool que_trace_on;
+
+/***************************************************************************
+Adds a query graph to the session's list of graphs. */
+
+void
+que_graph_publish(
+/*==============*/
+ que_t* graph, /* in: graph */
+ sess_t* sess); /* in: session */
+/***************************************************************************
+Creates a query graph fork node. */
+
+que_fork_t*
+que_fork_create(
+/*============*/
+ /* out, own: fork node */
+ que_t* graph, /* in: graph, if NULL then this
+ fork node is assumed to be the
+ graph root */
+ que_node_t* parent, /* in: parent node */
+ ulint fork_type, /* in: fork type */
+ mem_heap_t* heap); /* in: memory heap where created */
+/***************************************************************************
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork); /* in: query fork */
+/***************************************************************************
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork); /* in: query fork */
+/***************************************************************************
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /* in: graph node */
+ que_node_t* parent);/* in: parent */
+/***************************************************************************
+Creates a query graph thread node. */
+
+que_thr_t*
+que_thr_create(
+/*===========*/
+ /* out, own: query thread node */
+ que_fork_t* parent, /* in: parent node, i.e., a fork node */
+ mem_heap_t* heap); /* in: memory heap where created */
+/**************************************************************************
+Checks if the query graph is in a state where it should be freed, and
+frees it in that case. If the session is in a state where it should be
+closed, that is done as well. */
+
+ibool
+que_graph_try_free(
+/*===============*/
+ /* out: TRUE if freed */
+ que_t* graph); /* in: query graph */
+/**************************************************************************
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations; they are freed in que_graph_free. */
+
+void
+que_graph_free_recursive(
+/*=====================*/
+ que_node_t* node); /* in: query graph node */
+/**************************************************************************
+Frees a query graph. */
+
+void
+que_graph_free(
+/*===========*/
+ que_t* graph); /* in: query graph; we assume that the memory
+ heap where this graph was created is private
+ to this graph: if not, then use
+ que_graph_free_recursive and free the heap
+ afterwards! */
+/**************************************************************************
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx. The kernel mutex has
+to be reserved. */
+
+ibool
+que_thr_stop(
+/*=========*/
+ /* out: TRUE if stopped */
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction if the thread
+was not already active. */
+UNIV_INLINE
+void
+que_thr_move_to_run_state_for_mysql(
+/*================================*/
+ que_thr_t* thr, /* in: a query thread */
+ trx_t* trx); /* in: transaction */
+/**************************************************************************
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL
+select, when there is no error or lock wait. */
+UNIV_INLINE
+void
+que_thr_stop_for_mysql_no_error(
+/*============================*/
+ que_thr_t* thr, /* in: query thread */
+ trx_t* trx); /* in: transaction */
+/**************************************************************************
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL
+select. */
+
+void
+que_thr_stop_for_mysql(
+/*===================*/
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Runs query threads. Note that the individual query thread which is run
+within this function may change if, e.g., the OS thread executing this
+function uses a threshold amount of resources. */
+
+void
+que_run_threads(
+/*============*/
+ que_thr_t* thr); /* in: query thread which is run initially */
+/**************************************************************************
+After signal handling is finished, returns control to a query graph error
+handling routine. (Currently, just returns the control to the root of the
+graph so that the graph can communicate an error message to the client.) */
+
+void
+que_fork_error_handle(
+/*==================*/
+ trx_t* trx, /* in: trx */
+ que_t* fork); /* in: query graph which was run before signal
+ handling started, NULL not allowed */
+/**************************************************************************
+Handles an SQL error noticed during query thread execution. At the moment,
+does nothing! */
+
+void
+que_thr_handle_error(
+/*=================*/
+ que_thr_t* thr, /* in: query thread */
+ ulint err_no, /* in: error number */
+ byte* err_str,/* in, own: error string or NULL; NOTE: the
+ function will take care of freeing of the
+ string! */
+ ulint err_len);/* in: error string length */
+/**************************************************************************
+Moves a suspended query thread to the QUE_THR_RUNNING state and releases
+a single worker thread to execute it. This function should be used to end
+the wait state of a query thread waiting for a lock or a stored procedure
+completion. */
+
+void
+que_thr_end_wait(
+/*=============*/
+ que_thr_t* thr, /* in: query thread in the
+ QUE_THR_LOCK_WAIT,
+ or QUE_THR_PROCEDURE_WAIT, or
+ QUE_THR_SIG_REPLY_WAIT state */
+ que_thr_t** next_thr); /* in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/**************************************************************************
+Same as que_thr_end_wait, but no parameter next_thr available. */
+
+void
+que_thr_end_wait_no_next_thr(
+/*=========================*/
+ que_thr_t* thr); /* in: query thread in the
+ QUE_THR_LOCK_WAIT,
+ or QUE_THR_PROCEDURE_WAIT, or
+ QUE_THR_SIG_REPLY_WAIT state */
+/**************************************************************************
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned. */
+
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+ /* out: a query thread of the graph moved to
+ QUE_THR_RUNNING state, or NULL; the query
+ thread should be executed by que_run_threads
+ by the caller */
+ que_fork_t* fork, /* in: a query fork */
+ ulint command,/* in: command SESS_COMM_FETCH_NEXT, ... */
+ ulint param); /* in: possible parameter to the command */
+/***************************************************************************
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr); /* in: query thread */
+/***************************************************************************
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ que_node_t* node); /* in: graph node */
+/***************************************************************************
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node); /* in: graph node */
+/***************************************************************************
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node); /* in: graph node */
+/***************************************************************************
+Gets the value buffer size of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ /* out: val buffer size, not defined if
+ val.data == NULL in node */
+ que_node_t* node); /* in: graph node */
+/***************************************************************************
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /* in: graph node */
+ ulint size); /* in: size */
+/*************************************************************************
+Gets the next list node in a list of query graph nodes. */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node); /* in: node in a list */
+/*************************************************************************
+Gets the parent node of a query graph node. */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ /* out: parent node or NULL */
+ que_node_t* node); /* in: node */
+/*************************************************************************
+Catenates a query graph node to a list of them (possibly an empty list). */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ /* out: one-way list of nodes */
+ que_node_t* node_list, /* in: node list, or NULL */
+ que_node_t* node); /* in: node */
+/*************************************************************************
+Gets a query graph node list length. */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ /* out: length, for NULL list 0 */
+ que_node_t* node_list); /* in: node list, or NULL */
+/**************************************************************************
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped. */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+ /* out: TRUE if should be stopped; NOTE that
+ if the peek is made without reserving the
+ kernel mutex, then another peek with the
+ mutex reserved is necessary before deciding
+ the actual stopping */
+ que_thr_t* thr); /* in: query thread */
+/***************************************************************************
+Returns TRUE if the query graph is for a SELECT statement. */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+ /* out: TRUE if a select */
+ que_t* graph); /* in: graph */
+/**************************************************************************
+Prints info of an SQL query graph node. */
+
+void
+que_node_print_info(
+/*================*/
+ que_node_t* node); /* in: query graph node */
+
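+/* A minimal usage sketch, not part of the original interface: a caller that
+has obtained a query graph fork would typically start a command on it and
+then execute the returned query thread, as the comment on
+que_fork_start_command above describes. Kernel mutex handling and error
+checking are omitted, and SESS_COMM_FETCH_NEXT is just one possible
+command. */
+#if 0
+static void
+que_fork_usage_example(que_fork_t* fork)
+{
+	que_thr_t*	thr;
+
+	thr = que_fork_start_command(fork, SESS_COMM_FETCH_NEXT, 0);
+
+	if (thr != NULL) {
+		que_run_threads(thr);
+	}
+}
+#endif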
+
+/* Query graph query thread node: the fields are protected by the kernel
+mutex with the exceptions named below */
+
+struct que_thr_struct{
+ que_common_t common; /* type: QUE_NODE_THR */
+ que_node_t* child; /* graph child node */
+ que_t* graph; /* graph where this node belongs */
+ ibool is_active; /* TRUE if the thread has been set
+ to the run state in
+ que_thr_move_to_run_state, but not
+ deactivated in
+ que_thr_dec_reference_count */
+ ulint state; /* state of the query thread */
+ UT_LIST_NODE_T(que_thr_t)
+ thrs; /* list of thread nodes of the fork
+ node */
+ UT_LIST_NODE_T(que_thr_t)
+ trx_thrs; /* lists of threads in wait list of
+ the trx */
+ UT_LIST_NODE_T(que_thr_t)
+ queue; /* list of runnable thread nodes in
+ the server task queue */
+ /*------------------------------*/
+ /* The following fields are private to the OS thread executing the
+ query thread, and are not protected by the kernel mutex: */
+
+ que_node_t* run_node; /* pointer to the node where the
+ subgraph down from this node is
+ currently executed */
+ que_node_t* prev_node; /* pointer to the node from which
+ the control came */
+ ulint resource; /* resource usage of the query thread
+ thus far */
+};
+
+/* Query graph fork node: its fields are protected by the kernel mutex */
+struct que_fork_struct{
+ que_common_t common; /* type: QUE_NODE_FORK */
+ que_t* graph; /* query graph of this node */
+ ulint fork_type; /* fork type */
+ ulint n_active_thrs; /* if this is the root of a graph, the
+ number of query threads that have been
+ started in que_thr_move_to_run_state
+ but for which que_thr_dec_refer_count
+ has not yet been called */
+ trx_t* trx; /* transaction: this is set only in
+ the root node */
+ ulint state; /* state of the fork node */
+ que_thr_t* caller; /* pointer to a possible calling query
+ thread */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ thrs; /* list of query threads */
+ /*------------------------------*/
+ /* The fields in this section are defined only in the root node */
+ sym_tab_t* sym_tab; /* symbol table of the query,
+ generated by the parser, or NULL
+ if the graph was created 'by hand' */
+ ulint id; /* id of this query graph */
+ ulint command; /* command currently executed in the
+ graph */
+ ulint param; /* possible command parameter */
+
+ /* The following cur_... fields are relevant only in a select graph */
+
+ ulint cur_end; /* QUE_CUR_NOT_DEFINED, QUE_CUR_START,
+ QUE_CUR_END */
+ ulint cur_pos; /* if there are n rows in the result
+ set, values 0 and n + 1 mean before
+ first row, or after last row, depending
+ on cur_end; values 1...n mean a row
+ index */
+ ibool cur_on_row; /* TRUE if cursor is on a row, i.e.,
+ it is not before the first row or
+ after the last row */
+ dulint n_inserts; /* number of rows inserted */
+ dulint n_updates; /* number of rows updated */
+ dulint n_deletes; /* number of rows deleted */
+ sel_node_t* last_sel_node; /* last executed select node, or NULL
+ if none */
+ UT_LIST_NODE_T(que_fork_t)
+ graphs; /* list of query graphs of a session
+ or a stored procedure */
+ /*------------------------------*/
+ mem_heap_t* heap; /* memory heap where the fork was
+ created */
+
+};
+
+/* Query fork (or graph) types */
+#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */
+#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */
+#define QUE_FORK_INSERT 3
+#define QUE_FORK_UPDATE 4
+#define QUE_FORK_ROLLBACK 5
+ /* This is really the undo graph used in rollback,
+ no signal-sending roll_node in this graph */
+#define QUE_FORK_PURGE 6
+#define QUE_FORK_EXECUTE 7
+#define QUE_FORK_PROCEDURE 8
+#define QUE_FORK_PROCEDURE_CALL 9
+#define QUE_FORK_MYSQL_INTERFACE 10
+#define QUE_FORK_RECOVERY 11
+
+/* Query fork (or graph) states */
+#define QUE_FORK_ACTIVE 1
+#define QUE_FORK_COMMAND_WAIT 2
+#define QUE_FORK_INVALID 3
+#define QUE_FORK_BEING_FREED 4
+
+/* Flag which is ORed to control structure statement node types */
+#define QUE_NODE_CONTROL_STAT 1024
+
+/* Query graph node types */
+#define QUE_NODE_LOCK 1
+#define QUE_NODE_INSERT 2
+#define QUE_NODE_UPDATE 4
+#define QUE_NODE_CURSOR 5
+#define QUE_NODE_SELECT 6
+#define QUE_NODE_AGGREGATE 7
+#define QUE_NODE_FORK 8
+#define QUE_NODE_THR 9
+#define QUE_NODE_UNDO 10
+#define QUE_NODE_COMMIT 11
+#define QUE_NODE_ROLLBACK 12
+#define QUE_NODE_PURGE 13
+#define QUE_NODE_CREATE_TABLE 14
+#define QUE_NODE_CREATE_INDEX 15
+#define QUE_NODE_SYMBOL 16
+#define QUE_NODE_RES_WORD 17
+#define QUE_NODE_FUNC 18
+#define QUE_NODE_ORDER 19
+#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_ASSIGNMENT 23
+#define QUE_NODE_FETCH 24
+#define QUE_NODE_OPEN 25
+#define QUE_NODE_COL_ASSIGNMENT 26
+#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_RETURN 28
+#define QUE_NODE_ROW_PRINTF 29
+#define QUE_NODE_ELSIF 30
+#define QUE_NODE_CALL 31
+
+/* Query thread states */
+#define QUE_THR_RUNNING 1
+#define QUE_THR_PROCEDURE_WAIT 2
+#define QUE_THR_COMPLETED 3 /* in selects this means that the
+ thread is at the end of its result set
+ (or start, in case of a scroll cursor);
+ in other statements, this means the
+ thread has done its task */
+#define QUE_THR_COMMAND_WAIT 4
+#define QUE_THR_LOCK_WAIT 5
+#define QUE_THR_SIG_REPLY_WAIT 6
+#define QUE_THR_SUSPENDED 7
+#define QUE_THR_ERROR 8
+
+/* From where the cursor position is counted */
+#define QUE_CUR_NOT_DEFINED 1
+#define QUE_CUR_START 2
+#define QUE_CUR_END 3
+
+
+#ifndef UNIV_NONINL
+#include "que0que.ic"
+#endif
+
+#endif
diff --git a/innobase/include/que0que.ic b/innobase/include/que0que.ic
new file mode 100644
index 00000000000..e19198aad0e
--- /dev/null
+++ b/innobase/include/que0que.ic
@@ -0,0 +1,304 @@
+/******************************************************
+Query graph
+
+(c) 1996 Innobase Oy
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "usr0sess.h"
+
+/***************************************************************************
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr) /* in: query thread */
+{
+ ut_ad(thr);
+
+ return(thr->graph->trx);
+}
+
+/***************************************************************************
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork) /* in: query fork */
+{
+ return(UT_LIST_GET_FIRST(fork->thrs));
+}
+
+/***************************************************************************
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork) /* in: query fork */
+{
+ que_thr_t* thr;
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ return(thr->child);
+}
+
+/***************************************************************************
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ que_node_t* node) /* in: graph node */
+{
+ ut_ad(node);
+
+ return(((que_common_t*)node)->type);
+}
+
+/***************************************************************************
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node) /* in: graph node */
+{
+ ut_ad(node);
+
+ return(&(((que_common_t*)node)->val));
+}
+
+/***************************************************************************
+Gets the value buffer size of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ /* out: val buffer size, not defined if
+ val.data == NULL in node */
+ que_node_t* node) /* in: graph node */
+{
+ ut_ad(node);
+
+ return(((que_common_t*)node)->val_buf_size);
+}
+
+/***************************************************************************
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /* in: graph node */
+ ulint size) /* in: size */
+{
+ ut_ad(node);
+
+ ((que_common_t*)node)->val_buf_size = size;
+}
+
+/***************************************************************************
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /* in: graph node */
+ que_node_t* parent) /* in: parent */
+{
+ ut_ad(node);
+
+ ((que_common_t*)node)->parent = parent;
+}
+
+/***************************************************************************
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node) /* in: graph node */
+{
+ ut_ad(node);
+
+ return(&(((que_common_t*)node)->val.type));
+}
+
+/*************************************************************************
+Catenates a query graph node to a list of them (possibly an empty list). */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ /* out: one-way list of nodes */
+ que_node_t* node_list, /* in: node list, or NULL */
+ que_node_t* node) /* in: node */
+{
+ que_common_t* cnode;
+ que_common_t* cnode2;
+
+ cnode = node;
+
+ cnode->brother = NULL;
+
+ if (node_list == NULL) {
+
+ return(node);
+ }
+
+ cnode2 = node_list;
+
+ while (cnode2->brother != NULL) {
+ cnode2 = cnode2->brother;
+ }
+
+ cnode2->brother = node;
+
+ return(node_list);
+}
+
+/*************************************************************************
+Gets the next list node in a list of query graph nodes. */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ /* out: next node in a list of nodes */
+ que_node_t* node) /* in: node in a list */
+{
+ return(((que_common_t*)node)->brother);
+}
+
+/*************************************************************************
+Gets a query graph node list length. */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ /* out: length, for NULL list 0 */
+ que_node_t* node_list) /* in: node list, or NULL */
+{
+ que_common_t* cnode;
+ ulint len;
+
+ cnode = node_list;
+ len = 0;
+
+ while (cnode != NULL) {
+ len++;
+ cnode = cnode->brother;
+ }
+
+ return(len);
+}
+
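+/* A minimal usage sketch, not part of the original file: builds a two-node
+list with the functions defined above; node1 and node2 stand for any query
+graph nodes. */
+#if 0
+static ulint
+que_node_list_example(que_node_t* node1, que_node_t* node2)
+{
+	que_node_t*	list;
+
+	list = que_node_list_add_last(NULL, node1);
+	list = que_node_list_add_last(list, node2);
+
+	ut_ad(que_node_get_next(node1) == node2);
+
+	return(que_node_list_get_len(list));	/* == 2 */
+}
+#endif
+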
+/*************************************************************************
+Gets the parent node of a query graph node. */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ /* out: parent node or NULL */
+ que_node_t* node) /* in: node */
+{
+ return(((que_common_t*)node)->parent);
+}
+
+/**************************************************************************
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped. */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+ /* out: TRUE if should be stopped; NOTE that
+ if the peek is made without reserving the
+ kernel mutex, then another peek with the
+ mutex reserved is necessary before deciding
+ the actual stopping */
+ que_thr_t* thr) /* in: query thread */
+{
+ trx_t* trx;
+ que_t* graph;
+
+ graph = thr->graph;
+ trx = graph->trx;
+
+ if (graph->state != QUE_FORK_ACTIVE
+ || trx->que_state == TRX_QUE_LOCK_WAIT
+ || (UT_LIST_GET_LEN(trx->signals) > 0
+ && trx->que_state == TRX_QUE_RUNNING)) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************************
+Returns TRUE if the query graph is for a SELECT statement. */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+ /* out: TRUE if a select */
+ que_t* graph) /* in: graph */
+{
+ if (graph->fork_type == QUE_FORK_SELECT_SCROLL
+ || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************************
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction if thr was
+not active. */
+UNIV_INLINE
+void
+que_thr_move_to_run_state_for_mysql(
+/*================================*/
+ que_thr_t* thr, /* in: a query thread */
+ trx_t* trx) /* in: transaction */
+{
+ if (!thr->is_active) {
+
+ (thr->graph)->n_active_thrs++;
+
+ trx->n_active_thrs++;
+
+ thr->is_active = TRUE;
+
+ ut_ad((thr->graph)->n_active_thrs == 1);
+ ut_ad(trx->n_active_thrs == 1);
+ }
+
+ thr->state = QUE_THR_RUNNING;
+}
+
+/**************************************************************************
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL
+select, when there is no error or lock wait. */
+UNIV_INLINE
+void
+que_thr_stop_for_mysql_no_error(
+/*============================*/
+ que_thr_t* thr, /* in: query thread */
+ trx_t* trx) /* in: transaction */
+{
+ ut_ad(thr->state == QUE_THR_RUNNING);
+
+ thr->state = QUE_THR_COMPLETED;
+
+ thr->is_active = FALSE;
+ (thr->graph)->n_active_thrs--;
+
+ trx->n_active_thrs--;
+}
diff --git a/innobase/include/que0types.h b/innobase/include/que0types.h
new file mode 100644
index 00000000000..c7ce09db40b
--- /dev/null
+++ b/innobase/include/que0types.h
@@ -0,0 +1,42 @@
+/******************************************************
+Query graph global types
+
+(c) 1996 Innobase Oy
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0types_h
+#define que0types_h
+
+#include "data0data.h"
+#include "dict0types.h"
+
+/* Pseudotype for all graph nodes */
+typedef void que_node_t;
+
+typedef struct que_fork_struct que_fork_t;
+
+/* Query graph root is a fork node */
+typedef que_fork_t que_t;
+
+typedef struct que_thr_struct que_thr_t;
+typedef struct que_common_struct que_common_t;
+
+/* Common struct at the beginning of each query graph node; the name of this
+substruct must be 'common' */
+
+struct que_common_struct{
+ ulint type; /* query node type */
+ que_node_t* parent; /* back pointer to parent node, or NULL */
+ que_node_t* brother;/* pointer to a possible brother node */
+ dfield_t val; /* evaluated value for an expression */
+ ulint val_buf_size;
+ /* buffer size for the evaluated value data,
+ if the buffer has been allocated dynamically:
+ if this field is != 0, and the node is a
+ symbol node or a function node, then we
+ have to free the data field in val explicitly */
+};
+
+#endif
diff --git a/innobase/include/read0read.h b/innobase/include/read0read.h
new file mode 100644
index 00000000000..dea952c8547
--- /dev/null
+++ b/innobase/include/read0read.h
@@ -0,0 +1,92 @@
+/******************************************************
+Cursor read
+
+(c) 1997 Innobase Oy
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0read_h
+#define read0read_h
+
+#include "univ.i"
+
+
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "trx0trx.h"
+#include "read0types.h"
+
+/*************************************************************************
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view. */
+
+read_view_t*
+read_view_open_now(
+/*===============*/
+ /* out, own: read view struct */
+ trx_t* cr_trx, /* in: creating transaction, or NULL */
+ mem_heap_t* heap); /* in: memory heap from which allocated */
+/*************************************************************************
+Makes a copy of the oldest existing read view, or opens a new one. The view
+must be closed with ..._close. */
+
+read_view_t*
+read_view_oldest_copy_or_open_new(
+/*==============================*/
+ /* out, own: read view struct */
+ trx_t* cr_trx, /* in: creating transaction, or NULL */
+ mem_heap_t* heap); /* in: memory heap from which allocated */
+/*************************************************************************
+Closes a read view. */
+
+void
+read_view_close(
+/*============*/
+ read_view_t* view); /* in: read view */
+/*************************************************************************
+Checks if a read view sees the specified transaction. */
+UNIV_INLINE
+ibool
+read_view_sees_trx_id(
+/*==================*/
+ /* out: TRUE if sees */
+ read_view_t* view, /* in: read view */
+ dulint trx_id); /* in: trx id */
+
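+/* A minimal usage sketch, not part of the original interface: opens a read
+view, tests the visibility of a transaction id and closes the view again.
+Kernel mutex requirements are not shown, error handling is omitted, and
+mem_heap_create()/mem_heap_free() are assumed from mem0mem.h. */
+#if 0
+static ibool
+read_view_usage_example(trx_t* trx, dulint some_trx_id)
+{
+	mem_heap_t*	heap	= mem_heap_create(256);
+	read_view_t*	view	= read_view_open_now(trx, heap);
+	ibool		sees	= read_view_sees_trx_id(view, some_trx_id);
+
+	read_view_close(view);
+	mem_heap_free(heap);
+
+	return(sees);
+}
+#endif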
+
+/* A read view lists the trx ids of those transactions for which a consistent
+read should not see the modifications to the database. */
+
+struct read_view_struct{
+ ibool can_be_too_old; /* TRUE if the system has had to purge old
+ versions which this read view should be able
+ to access: the read view can bump into the
+ DB_MISSING_HISTORY error */
+ dulint low_limit_no; /* The view does not need to see the undo
+ logs for transactions whose transaction number
+ is strictly smaller (<) than this value: they
+ can be removed in purge if not needed by other
+ views */
+ dulint low_limit_id; /* The read should not see any transaction
+ with trx id >= this value */
+ dulint up_limit_id; /* The read should see all trx ids which
+ are strictly smaller (<) than this value */
+ ulint n_trx_ids; /* Number of cells in the trx_ids array */
+ dulint* trx_ids; /* Additional trx ids which the read should
+ not see: typically, these are the active
+ transactions at the time when the read is
+ serialized, except the reading transaction
+ itself; the trx ids in this array are in a
+ descending order */
+ trx_t* creator; /* Pointer to the creating transaction, or
+ NULL if used in purge */
+ UT_LIST_NODE_T(read_view_t) view_list;
+ /* List of read views in trx_sys */
+};
+
+#ifndef UNIV_NONINL
+#include "read0read.ic"
+#endif
+
+#endif
diff --git a/innobase/include/read0read.ic b/innobase/include/read0read.ic
new file mode 100644
index 00000000000..03d84ee0c51
--- /dev/null
+++ b/innobase/include/read0read.ic
@@ -0,0 +1,85 @@
+/******************************************************
+Cursor read
+
+(c) 1997 Innobase Oy
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+/*************************************************************************
+Gets the nth trx id in a read view. */
+UNIV_INLINE
+dulint
+read_view_get_nth_trx_id(
+/*=====================*/
+ /* out: trx id */
+ read_view_t* view, /* in: read view */
+ ulint n) /* in: position */
+{
+ ut_ad(n < view->n_trx_ids);
+
+ return(*(view->trx_ids + n));
+}
+
+/*************************************************************************
+Sets the nth trx id in a read view. */
+UNIV_INLINE
+void
+read_view_set_nth_trx_id(
+/*=====================*/
+ read_view_t* view, /* in: read view */
+ ulint n, /* in: position */
+ dulint trx_id) /* in: trx id to set */
+{
+ ut_ad(n < view->n_trx_ids);
+
+ *(view->trx_ids + n) = trx_id;
+}
+
+/*************************************************************************
+Checks if a read view sees the specified transaction. */
+UNIV_INLINE
+ibool
+read_view_sees_trx_id(
+/*==================*/
+ /* out: TRUE if sees */
+ read_view_t* view, /* in: read view */
+ dulint trx_id) /* in: trx id */
+{
+ ulint n_ids;
+ int cmp;
+ ulint i;
+
+ if (ut_dulint_cmp(trx_id, view->up_limit_id) < 0) {
+
+ return(TRUE);
+ }
+
+ if (ut_dulint_cmp(trx_id, view->low_limit_id) >= 0) {
+
+ return(FALSE);
+ }
+
+ /* We go through the trx ids in the array smallest first: this order
+ may save CPU time, because if there was a very long running
+ transaction in the trx id array, its trx id is looked at first, and
+ the first two comparisons may well decide the visibility of trx_id. */
+
+ n_ids = view->n_trx_ids;
+
+ for (i = 0; i < n_ids; i++) {
+
+ cmp = ut_dulint_cmp(trx_id,
+ read_view_get_nth_trx_id(view, n_ids - i - 1));
+ if (0 == cmp) {
+
+ return(FALSE);
+
+ } else if (cmp < 0) {
+
+ return(TRUE);
+ }
+ }
+
+ return(TRUE);
+}
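+
+/* Worked example (illustration only, not from the original file): suppose
+up_limit_id == 100, low_limit_id == 200 and trx_ids == {180, 150, 120},
+stored in descending order. Then trx id 90 is seen (smaller than
+up_limit_id), 250 is not seen (>= low_limit_id), 150 is not seen (found in
+the array), and 130 is seen (between the limits but not in the array). */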
diff --git a/innobase/include/read0types.h b/innobase/include/read0types.h
new file mode 100644
index 00000000000..5eb3e533f89
--- /dev/null
+++ b/innobase/include/read0types.h
@@ -0,0 +1,14 @@
+/******************************************************
+Cursor read
+
+(c) 1997 Innobase Oy
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0types_h
+#define read0types_h
+
+typedef struct read_view_struct read_view_t;
+
+#endif
diff --git a/innobase/include/rem0cmp.h b/innobase/include/rem0cmp.h
new file mode 100644
index 00000000000..77b9ef9edc8
--- /dev/null
+++ b/innobase/include/rem0cmp.h
@@ -0,0 +1,130 @@
+/***********************************************************************
+Comparison services for records
+
+(c) 1994-1996 Innobase Oy
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef rem0cmp_h
+#define rem0cmp_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+ /* out: 1, 0, -1, if dfield1 is greater, equal,
+ less than dfield2, respectively */
+ dfield_t* dfield1,/* in: data field; must have type field set */
+ dfield_t* dfield2);/* in: data field */
+/*****************************************************************
+This function is used to compare a data tuple to a physical record.
+Only the first dtuple->n_fields_cmp fields are taken into account for
+the data tuple! If we denote by n = n_fields_cmp, then rec must
+have either m >= n fields, or it must differ from dtuple in some of
+the m fields rec has. */
+
+int
+cmp_dtuple_rec_with_match(
+/*======================*/
+ /* out: 1, 0, -1, if dtuple is greater, equal,
+ less than rec, respectively, when only the
+ common first fields are compared */
+ dtuple_t* dtuple, /* in: data tuple */
+ rec_t* rec, /* in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ ulint* matched_fields, /* in/out: number of already completely
+ matched fields; when function returns,
+ contains the value for current comparison */
+ ulint* matched_bytes); /* in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns, contains the
+ value for current comparison */
+/******************************************************************
+Compares a data tuple to a physical record. */
+
+int
+cmp_dtuple_rec(
+/*===========*/
+ /* out: 1, 0, -1, if dtuple is greater, equal,
+ less than rec, respectively; see the comments
+ for cmp_dtuple_rec_with_match */
+ dtuple_t* dtuple, /* in: data tuple */
+ rec_t* rec); /* in: physical record */
+/******************************************************************
+Checks if a dtuple is a prefix of a record. The last field in dtuple
+is allowed to be a prefix of the corresponding field in the record. */
+
+ibool
+cmp_dtuple_is_prefix_of_rec(
+/*========================*/
+ /* out: TRUE if prefix */
+ dtuple_t* dtuple, /* in: data tuple */
+ rec_t* rec); /* in: physical record */
+/******************************************************************
+Compares a prefix of a data tuple to a prefix of a physical record for
+equality. If there are fewer fields in rec than parameter n_fields, FALSE
+is returned. NOTE that n_fields_cmp of dtuple does not affect this
+comparison. */
+
+ibool
+cmp_dtuple_rec_prefix_equal(
+/*========================*/
+ /* out: TRUE if equal */
+ dtuple_t* dtuple, /* in: data tuple */
+ rec_t* rec, /* in: physical record */
+ ulint n_fields); /* in: number of fields which should be
+ compared; must not exceed the number of
+ fields in dtuple */
+/*****************************************************************
+This function is used to compare two physical records. Only the common
+first fields are compared. */
+
+int
+cmp_rec_rec_with_match(
+/*===================*/
+ /* out: 1, 0 , -1 if rec1 is greater, equal,
+ less, respectively, than rec2; only the common
+ first fields are compared */
+ rec_t* rec1, /* in: physical record */
+ rec_t* rec2, /* in: physical record */
+ dict_index_t* index, /* in: data dictionary index */
+ ulint* matched_fields, /* in/out: number of already completely
+ matched fields; when the function returns,
+ contains the value for the current
+ comparison */
+ ulint* matched_bytes);/* in/out: number of already matched
+ bytes within the first field not completely
+ matched; when the function returns, contains
+ the value for the current comparison */
+/*****************************************************************
+This function is used to compare two physical records. Only the common
+first fields are compared. */
+UNIV_INLINE
+int
+cmp_rec_rec(
+/*========*/
+ /* out: 1, 0 , -1 if rec1 is greater, equal,
+ less, respectively, than rec2; only the common
+ first fields are compared */
+ rec_t* rec1, /* in: physical record */
+ rec_t* rec2, /* in: physical record */
+ dict_index_t* index); /* in: data dictionary index */
+
+
+#ifndef UNIV_NONINL
+#include "rem0cmp.ic"
+#endif
+
+#endif
diff --git a/innobase/include/rem0cmp.ic b/innobase/include/rem0cmp.ic
new file mode 100644
index 00000000000..ebf513f538c
--- /dev/null
+++ b/innobase/include/rem0cmp.ic
@@ -0,0 +1,84 @@
+/***********************************************************************
+Comparison services for records
+
+(c) 1994-1996 Innobase Oy
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type. */
+
+int
+cmp_data_data_slow(
+/*===============*/
+ /* out: 1, 0, -1, if data1 is greater, equal,
+ less than data2, respectively */
+ dtype_t* cur_type,/* in: data type of the fields */
+ byte* data1, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /* in: data field length or UNIV_SQL_NULL */
+ byte* data2, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2); /* in: data field length or UNIV_SQL_NULL */
+
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type. */
+UNIV_INLINE
+int
+cmp_data_data(
+/*==========*/
+ /* out: 1, 0, -1, if data1 is greater, equal,
+ less than data2, respectively */
+ dtype_t* cur_type,/* in: data type of the fields */
+ byte* data1, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /* in: data field length or UNIV_SQL_NULL */
+ byte* data2, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2) /* in: data field length or UNIV_SQL_NULL */
+{
+ return(cmp_data_data_slow(cur_type, data1, len1, data2, len2));
+}
+
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+ /* out: 1, 0, -1, if dfield1 is greater, equal,
+ less than dfield2, respectively */
+ dfield_t* dfield1,/* in: data field; must have type field set */
+ dfield_t* dfield2)/* in: data field */
+{
+ ut_ad(dfield_check_typed(dfield1));
+
+ return(cmp_data_data(dfield_get_type(dfield1),
+ dfield_get_data(dfield1), dfield_get_len(dfield1),
+ dfield_get_data(dfield2), dfield_get_len(dfield2)));
+}
+
+/*****************************************************************
+This function is used to compare two physical records. Only the common
+first fields are compared. */
+UNIV_INLINE
+int
+cmp_rec_rec(
+/*========*/
+ /* out: 1, 0 , -1 if rec1 is greater, equal,
+ less, respectively, than rec2; only the common
+ first fields are compared */
+ rec_t* rec1, /* in: physical record */
+ rec_t* rec2, /* in: physical record */
+ dict_index_t* index) /* in: data dictionary index */
+{
+ ulint match_f = 0;
+ ulint match_b = 0;
+
+ return(cmp_rec_rec_with_match(rec1, rec2, index, &match_f, &match_b));
+}
diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h
new file mode 100644
index 00000000000..62c0aa14519
--- /dev/null
+++ b/innobase/include/rem0rec.h
@@ -0,0 +1,357 @@
+/************************************************************************
+Record manager
+
+(c) 1994-1996 Innobase Oy
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0rec_h
+#define rem0rec_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "rem0types.h"
+
+/* Maximum values for various fields (for non-blob tuples) */
+#define REC_MAX_N_FIELDS (1024 - 1)
+#define REC_MAX_HEAP_NO (2 * 8192 - 1)
+#define REC_MAX_N_OWNED (16 - 1)
+
+/* Flag denoting the predefined minimum record: this bit is ORed in the 4
+info bits of a record */
+#define REC_INFO_MIN_REC_FLAG 0x10
+
+/* Number of extra bytes in a record, in addition to the data and the
+offsets */
+#define REC_N_EXTRA_BYTES 6
+
+/**********************************************************
+The following function is used to get the offset of the
+next chained record on the same page. */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ /* out: the page offset of the next
+ chained record */
+ rec_t* rec); /* in: physical record */
+/**********************************************************
+The following function is used to set the next record offset field
+of the record. */
+UNIV_INLINE
+void
+rec_set_next_offs(
+/*==============*/
+ rec_t* rec, /* in: physical record */
+ ulint next); /* in: offset of the next record */
+/**********************************************************
+The following function is used to get the number of fields
+in the record. */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ /* out: number of data fields */
+ rec_t* rec); /* in: physical record */
+/**********************************************************
+The following function is used to get the number of records
+owned by the previous directory record. */
+UNIV_INLINE
+ulint
+rec_get_n_owned(
+/*============*/
+ /* out: number of owned records */
+ rec_t* rec); /* in: physical record */
+/**********************************************************
+The following function is used to set the number of owned
+records. */
+UNIV_INLINE
+void
+rec_set_n_owned(
+/*============*/
+ rec_t* rec, /* in: physical record */
+ ulint n_owned); /* in: the number of owned */
+ ulint n_owned); /* in: the number of owned records */
+The following function is used to retrieve the info bits of
+a record. */
+UNIV_INLINE
+ulint
+rec_get_info_bits(
+/*==============*/
+ /* out: info bits */
+ rec_t* rec); /* in: physical record */
+/**********************************************************
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits(
+/*==============*/
+ rec_t* rec, /* in: physical record */
+ ulint bits); /* in: info bits */
+/**********************************************************
+Gets the value of the deleted flag in info bits. */
+UNIV_INLINE
+ibool
+rec_info_bits_get_deleted_flag(
+/*===========================*/
+ /* out: TRUE if deleted flag set */
+ ulint info_bits); /* in: info bits from a record */
+/**********************************************************
+The following function tells if record is delete marked. */
+UNIV_INLINE
+ibool
+rec_get_deleted_flag(
+/*=================*/
+ /* out: TRUE if delete marked */
+ rec_t* rec); /* in: physical record */
+/**********************************************************
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag(
+/*=================*/
+ rec_t* rec, /* in: physical record */
+ ibool flag); /* in: TRUE if delete marked */
+/**********************************************************
+The following function is used to get the order number
+of the record in the heap of the index page. */
+UNIV_INLINE
+ulint
+rec_get_heap_no(
+/*=============*/
+ /* out: heap order number */
+ rec_t* rec); /* in: physical record */
+/**********************************************************
+The following function is used to set the heap number
+field in the record. */
+UNIV_INLINE
+void
+rec_set_heap_no(
+/*=============*/
+ rec_t* rec, /* in: physical record */
+ ulint heap_no);/* in: the heap number */
+/**********************************************************
+The following function is used to test whether the data offsets
+in the record are stored in one-byte or two-byte format. */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ /* out: TRUE if 1-byte form */
+ rec_t* rec); /* in: physical record */
+/****************************************************************
+The following function is used to get a pointer to the nth
+data field in the record. */
+
+byte*
+rec_get_nth_field(
+/*==============*/
+ /* out: pointer to the field, NULL if SQL null */
+ rec_t* rec, /* in: record */
+ ulint n, /* in: index of the field */
+ ulint* len); /* out: length of the field; UNIV_SQL_NULL
+ if SQL null */
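+/* Illustration only, not part of the original file: a typical way to read
+the nth field and detect SQL null, based on the declaration above.
+
+	ulint	len;
+	byte*	field = rec_get_nth_field(rec, n, &len);
+
+	if (len == UNIV_SQL_NULL) {
+		... the field is SQL null, and field == NULL ...
+	}
+*/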
+/****************************************************************
+Gets the physical size of a field. Note that even an SQL null may have a
+field of size > 0, if the data type is of a fixed size. */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ /* out: field size in bytes */
+ rec_t* rec, /* in: record */
+ ulint n); /* in: index of the field */
+/****************************************************************
+The following function is used to get a copy of the nth
+data field in the record to a buffer. */
+UNIV_INLINE
+void
+rec_copy_nth_field(
+/*===============*/
+ void* buf, /* in: pointer to the buffer */
+ rec_t* rec, /* in: record */
+ ulint n, /* in: index of the field */
+ ulint* len); /* out: length of the field; UNIV_SQL_NULL if SQL
+ null */
+/***************************************************************
+This is used to modify the value of an already existing field in
+a physical record. The previous value must have exactly the same
+size as the new value. If len is UNIV_SQL_NULL then the field is
+treated as SQL null. */
+UNIV_INLINE
+void
+rec_set_nth_field(
+/*==============*/
+ rec_t* rec, /* in: record */
+ ulint n, /* in: index of the field */
+ void* data, /* in: pointer to the data if not SQL null */
+ ulint len); /* in: length of the data or UNIV_SQL_NULL.
+ If not SQL null, must have the same length as the
+ previous value. If SQL null, previous value must be
+ SQL null. */
+/**************************************************************
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes. */
+UNIV_INLINE
+ulint
+rec_get_data_size(
+/*==============*/
+ /* out: size */
+ rec_t* rec); /* in: physical record */
+/**************************************************************
+Returns the total size of record minus data size of record.
+The value returned by the function is the distance from record
+start to record origin in bytes. */
+UNIV_INLINE
+ulint
+rec_get_extra_size(
+/*===============*/
+ /* out: size */
+ rec_t* rec); /* in: physical record */
+/**************************************************************
+Returns the total size of a physical record. */
+UNIV_INLINE
+ulint
+rec_get_size(
+/*=========*/
+ /* out: size */
+ rec_t* rec); /* in: physical record */
+/**************************************************************
+Returns a pointer to the start of the record. */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ /* out: pointer to start */
+ rec_t* rec); /* in: pointer to record */
+/**************************************************************
+Returns a pointer to the end of the record. */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ /* out: pointer to end */
+ rec_t* rec); /* in: pointer to record */
+/*******************************************************************
+Copies a physical record to a buffer. */
+UNIV_INLINE
+rec_t*
+rec_copy(
+/*=====*/
+ /* out: pointer to the origin of the copied record */
+ void* buf, /* in: buffer */
+ rec_t* rec); /* in: physical record */
+/******************************************************************
+Copies the first n fields of a physical record to a new physical record in
+a buffer. */
+
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+ /* out, own: copied record */
+ rec_t* rec, /* in: physical record */
+ ulint n_fields, /* in: number of fields to copy */
+ byte** buf, /* in/out: memory buffer for the copied prefix,
+ or NULL */
+ ulint* buf_size); /* in/out: buffer size */
+/****************************************************************
+Folds a prefix of a physical record to a ulint. */
+UNIV_INLINE
+ulint
+rec_fold(
+/*=====*/
+ /* out: the folded value */
+ rec_t* rec, /* in: the physical record */
+ ulint n_fields, /* in: number of complete fields to fold */
+ ulint n_bytes, /* in: number of bytes to fold in an
+ incomplete last field */
+ dulint tree_id); /* in: index tree id */
+/*************************************************************
+Builds a physical record out of a data tuple and stores it beginning from
+address destination. */
+UNIV_INLINE
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+ /* out: pointer to the origin of physical
+ record */
+ byte* destination, /* in: start address of the physical record */
+ dtuple_t* dtuple); /* in: data tuple */
+/*************************************************************
+Builds a physical record out of a data tuple and stores it beginning from
+address destination. */
+
+rec_t*
+rec_convert_dtuple_to_rec_low(
+/*==========================*/
+ /* out: pointer to the origin of physical
+ record */
+ byte* destination, /* in: start address of the physical record */
+ dtuple_t* dtuple, /* in: data tuple */
+ ulint data_size); /* in: data size of dtuple */
+/**************************************************************
+Returns the extra size of a physical record if we know its
+data size and number of fields. */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ /* out: extra size */
+ ulint data_size, /* in: data size */
+ ulint n_fields); /* in: number of fields */
+/**************************************************************
+The following function returns the size of a data tuple when converted to
+a physical record. */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ /* out: size */
+ dtuple_t* dtuple);/* in: data tuple */
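+/* A minimal usage sketch, not part of the original interface: converts a
+data tuple to a physical record in a caller-provided buffer; ut_a() is the
+usual InnoBase assertion macro, and error handling is otherwise omitted. */
+#if 0
+static rec_t*
+rec_convert_example(dtuple_t* dtuple, byte* buf, ulint buf_size)
+{
+	/* The whole converted record, extra bytes plus data, must fit */
+	ut_a(rec_get_converted_size(dtuple) <= buf_size);
+
+	/* The returned pointer is the record origin within buf */
+	return(rec_convert_dtuple_to_rec(buf, dtuple));
+}
+#endif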
+/******************************************************************
+Copies the first n fields of a physical record to a data tuple.
+The fields are copied to the memory heap. */
+
+void
+rec_copy_prefix_to_dtuple(
+/*======================*/
+ dtuple_t* tuple, /* in: data tuple */
+ rec_t* rec, /* in: physical record */
+ ulint n_fields, /* in: number of fields to copy */
+ mem_heap_t* heap); /* in: memory heap */
+/*******************************************************************
+Validates the consistency of a physical record. */
+
+ibool
+rec_validate(
+/*=========*/
+ /* out: TRUE if ok */
+ rec_t* rec); /* in: physical record */
+/*******************************************************************
+Prints a physical record. */
+
+void
+rec_print(
+/*======*/
+ rec_t* rec); /* in: physical record */
+/*******************************************************************
+Prints a physical record to a buffer. */
+
+ulint
+rec_sprintf(
+/*========*/
+ /* out: printed length in bytes */
+ char* buf, /* in: buffer to print to */
+ ulint buf_len,/* in: buffer length */
+ rec_t* rec); /* in: physical record */
+
+#define REC_INFO_BITS 6 /* This is a single-byte bit-field */
+
+#ifndef UNIV_NONINL
+#include "rem0rec.ic"
+#endif
+
+#endif
diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic
new file mode 100644
index 00000000000..c63b25374dd
--- /dev/null
+++ b/innobase/include/rem0rec.ic
@@ -0,0 +1,959 @@
+/************************************************************************
+Record manager
+
+(c) 1994-1996 Innobase Oy
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mach0data.h"
+#include "ut0byte.h"
+
+/* Offsets of the bit-fields in the record. NOTE! In the table the most
+significant bytes and bits are written below less significant.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits pointer to next record
+ 2 8 bits pointer to next record
+ 3 1 bit short flag
+ 7 bits number of fields
+ 4 3 bits number of fields
+ 5 bits heap number
+ 5 8 bits heap number
+ 6 4 bits n_owned
+ 4 bits info bits
+*/
+
+
+/* Maximum lengths for the data in a physical record if the offsets
+are given in one-byte (resp. two-byte) format. */
+#define REC_1BYTE_OFFS_LIMIT 0x7F
+#define REC_2BYTE_OFFS_LIMIT 0x7FFF
+
+/* We list the byte offsets from the origin of the record, the mask,
+and the shift needed to obtain each bit-field of the record. */
+
+#define REC_NEXT 2
+#define REC_NEXT_MASK 0xFFFF
+#define REC_NEXT_SHIFT 0
+
+#define REC_SHORT 3 /* This is a single-byte bit-field */
+#define REC_SHORT_MASK 0x1
+#define REC_SHORT_SHIFT 0
+
+#define REC_N_FIELDS 4
+#define REC_N_FIELDS_MASK 0x7FE
+#define REC_N_FIELDS_SHIFT 1
+
+#define REC_HEAP_NO 5
+#define REC_HEAP_NO_MASK 0xFFF8
+#define REC_HEAP_NO_SHIFT 3
+
+#define REC_N_OWNED 6 /* This is a single-byte bit-field */
+#define REC_N_OWNED_MASK 0xF
+#define REC_N_OWNED_SHIFT 0
+
+#define REC_INFO_BITS_MASK 0xF0
+#define REC_INFO_BITS_SHIFT 0
+
+/* The deleted flag in info bits */
+#define REC_INFO_DELETED_FLAG 0x20 /* when bit is set to 1, it means the
+ record has been delete marked */
+/* The following masks are used to filter the SQL null bit from
+one-byte and two-byte offsets */
+
+#define REC_1BYTE_SQL_NULL_MASK 0x80
+#define REC_2BYTE_SQL_NULL_MASK 0x8000
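+
+/**********************************************************
+Editor's note: an illustrative sketch, not part of the original Innobase
+source. It shows how a mask/shift pair above addresses one bit-field: the
+heap number, for example, occupies the masked bits of the two bytes at
+offsets 4 and 5 below the record origin, which are read and written in
+big-endian order. The helper name is hypothetical. */
+#if 0
+UNIV_INLINE
+void
+rec_heap_no_example(
+/*================*/
+	rec_t*	rec)	/* in: physical record */
+{
+	ulint	two_bytes;
+
+	/* Store heap number 5: clear the masked bits and OR in the
+	shifted value */
+	two_bytes = mach_read_from_2(rec - REC_HEAP_NO);
+	two_bytes = (two_bytes & ~REC_HEAP_NO_MASK)
+					| (5 << REC_HEAP_NO_SHIFT);
+	mach_write_to_2(rec - REC_HEAP_NO, two_bytes);
+
+	/* Read it back: mask first, then shift right, exactly as
+	rec_get_bit_field_2 below does */
+	ut_a(5 == ((mach_read_from_2(rec - REC_HEAP_NO)
+			& REC_HEAP_NO_MASK) >> REC_HEAP_NO_SHIFT));
+}
+#endif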
+
+/***************************************************************
+Sets the value of the ith field SQL null bit. */
+
+void
+rec_set_nth_field_null_bit(
+/*=======================*/
+ rec_t* rec, /* in: record */
+ ulint i, /* in: ith field */
+ ibool val); /* in: value to set */
+/***************************************************************
+Sets a record field to SQL null. The physical size of the field is not
+changed. */
+
+void
+rec_set_nth_field_sql_null(
+/*=======================*/
+ rec_t* rec, /* in: record */
+ ulint n); /* in: index of the field */
+
+/**********************************************************
+Gets a bit field from within 1 byte. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_1(
+/*================*/
+ rec_t* rec, /* in: pointer to record origin */
+ ulint offs, /* in: offset from the origin down */
+ ulint mask, /* in: mask used to filter bits */
+ ulint shift) /* in: shift right applied after masking */
+{
+ ut_ad(rec);
+
+ return((mach_read_from_1(rec - offs) & mask) >> shift);
+}
+
+/**********************************************************
+Sets a bit field within 1 byte. */
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+ rec_t* rec, /* in: pointer to record origin */
+ ulint val, /* in: value to set */
+ ulint offs, /* in: offset from the origin down */
+ ulint mask, /* in: mask used to filter bits */
+ ulint shift) /* in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_EXTRA_BYTES);
+ ut_ad(mask);
+ ut_ad(mask <= 0xFF);
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_1(rec - offs,
+ (mach_read_from_1(rec - offs) & ~mask)
+ | (val << shift));
+}
+
+/**********************************************************
+Gets a bit field from within 2 bytes. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+ rec_t* rec, /* in: pointer to record origin */
+ ulint offs, /* in: offset from the origin down */
+ ulint mask, /* in: mask used to filter bits */
+ ulint shift) /* in: shift right applied after masking */
+{
+ ut_ad(rec);
+
+ return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/**********************************************************
+Sets a bit field within 2 bytes. */
+UNIV_INLINE
+void
+rec_set_bit_field_2(
+/*================*/
+ rec_t* rec, /* in: pointer to record origin */
+ ulint val, /* in: value to set */
+ ulint offs, /* in: offset from the origin down */
+ ulint mask, /* in: mask used to filter bits */
+ ulint shift) /* in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_EXTRA_BYTES);
+ ut_ad(mask > 0xFF);
+ ut_ad(mask <= 0xFFFF);
+ ut_ad((mask >> shift) & 1);
+ ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1)));
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+#ifdef UNIV_DEBUG
+ {
+ ulint m;
+
+ /* The following assertion checks that the masks of currently
+ defined bit-fields in bytes 3-6 do not overlap. */
+ m = (ulint)((REC_SHORT_MASK << (8 * (REC_SHORT - 3)))
+ + (REC_N_FIELDS_MASK << (8 * (REC_N_FIELDS - 4)))
+ + (REC_HEAP_NO_MASK << (8 * (REC_HEAP_NO - 4)))
+ + (REC_N_OWNED_MASK << (8 * (REC_N_OWNED - 3)))
+ + (REC_INFO_BITS_MASK << (8 * (REC_INFO_BITS - 3))));
+ if (m != ut_dbg_zero + 0xFFFFFFFF) {
+ printf("Sum of masks %lx\n", m);
+ ut_error;
+ }
+ }
+#endif
+ mach_write_to_2(rec - offs,
+ (mach_read_from_2(rec - offs) & ~mask)
+ | (val << shift));
+}
+
+/**********************************************************
+The following function is used to get the offset of the next chained record
+on the same page. */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ /* out: the page offset of the next chained record */
+ rec_t* rec) /* in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_2(rec, REC_NEXT, REC_NEXT_MASK,
+ REC_NEXT_SHIFT);
+ ut_ad(ret < UNIV_PAGE_SIZE);
+
+ return(ret);
+}
+
+/**********************************************************
+The following function is used to set the next record offset field of the
+record. */
+UNIV_INLINE
+void
+rec_set_next_offs(
+/*==============*/
+ rec_t* rec, /* in: physical record */
+ ulint next) /* in: offset of the next record */
+{
+ ut_ad(rec);
+ ut_ad(UNIV_PAGE_SIZE > next);
+
+ rec_set_bit_field_2(rec, next, REC_NEXT, REC_NEXT_MASK,
+ REC_NEXT_SHIFT);
+}
+
+/**********************************************************
+The following function is used to get the number of fields in the record. */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ /* out: number of data fields */
+ rec_t* rec) /* in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_2(rec, REC_N_FIELDS, REC_N_FIELDS_MASK,
+ REC_N_FIELDS_SHIFT);
+ ut_ad(ret <= REC_MAX_N_FIELDS);
+ ut_ad(ret > 0);
+
+ return(ret);
+}
+
+/**********************************************************
+The following function is used to set the number of fields field in the
+record. */
+UNIV_INLINE
+void
+rec_set_n_fields(
+/*=============*/
+ rec_t* rec, /* in: physical record */
+ ulint n_fields) /* in: the number of fields */
+{
+ ut_ad(rec);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields > 0);
+
+ rec_set_bit_field_2(rec, n_fields, REC_N_FIELDS, REC_N_FIELDS_MASK,
+ REC_N_FIELDS_SHIFT);
+}
+
+/**********************************************************
+The following function is used to get the number of records owned by the
+previous directory record. */
+UNIV_INLINE
+ulint
+rec_get_n_owned(
+/*============*/
+ /* out: number of owned records */
+ rec_t* rec) /* in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_1(rec, REC_N_OWNED, REC_N_OWNED_MASK,
+ REC_N_OWNED_SHIFT);
+ ut_ad(ret <= REC_MAX_N_OWNED);
+
+ return(ret);
+}
+
+/**********************************************************
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned(
+/*============*/
+ rec_t* rec, /* in: physical record */
+ ulint n_owned) /* in: the number of owned */
+{
+ ut_ad(rec);
+ ut_ad(n_owned <= REC_MAX_N_OWNED);
+
+ rec_set_bit_field_1(rec, n_owned, REC_N_OWNED, REC_N_OWNED_MASK,
+ REC_N_OWNED_SHIFT);
+}
+
+/**********************************************************
+The following function is used to retrieve the info bits of a record. */
+UNIV_INLINE
+ulint
+rec_get_info_bits(
+/*==============*/
+ /* out: info bits */
+ rec_t* rec) /* in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_1(rec, REC_INFO_BITS, REC_INFO_BITS_MASK,
+ REC_INFO_BITS_SHIFT);
+ ut_ad((ret & ~REC_INFO_BITS_MASK) == 0);
+
+ return(ret);
+}
+
+/**********************************************************
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits(
+/*==============*/
+ rec_t* rec, /* in: physical record */
+ ulint bits) /* in: info bits */
+{
+ ut_ad(rec);
+ ut_ad((bits & ~REC_INFO_BITS_MASK) == 0);
+
+ rec_set_bit_field_1(rec, bits, REC_INFO_BITS, REC_INFO_BITS_MASK,
+ REC_INFO_BITS_SHIFT);
+}
+
+/**********************************************************
+Gets the value of the deleted flag in info bits. */
+UNIV_INLINE
+ibool
+rec_info_bits_get_deleted_flag(
+/*===========================*/
+ /* out: TRUE if deleted flag set */
+ ulint info_bits) /* in: info bits from a record */
+{
+ if (info_bits & REC_INFO_DELETED_FLAG) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************
+The following function tells if record is delete marked. */
+UNIV_INLINE
+ibool
+rec_get_deleted_flag(
+/*=================*/
+ /* out: TRUE if delete marked */
+ rec_t* rec) /* in: physical record */
+{
+ if (REC_INFO_DELETED_FLAG & rec_get_info_bits(rec)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag(
+/*=================*/
+ rec_t* rec, /* in: physical record */
+ ibool flag) /* in: TRUE if delete marked */
+{
+ ulint old_val;
+ ulint new_val;
+
+ ut_ad(TRUE == 1);
+ ut_ad(flag <= TRUE);
+
+ old_val = rec_get_info_bits(rec);
+
+ if (flag) {
+ new_val = REC_INFO_DELETED_FLAG | old_val;
+ } else {
+ new_val = ~REC_INFO_DELETED_FLAG & old_val;
+ }
+
+ rec_set_info_bits(rec, new_val);
+}
+
+/**********************************************************
+The following function is used to get the order number of the record in the
+heap of the index page. */
+UNIV_INLINE
+ulint
+rec_get_heap_no(
+/*=============*/
+ /* out: heap order number */
+ rec_t* rec) /* in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_2(rec, REC_HEAP_NO, REC_HEAP_NO_MASK,
+ REC_HEAP_NO_SHIFT);
+ ut_ad(ret <= REC_MAX_HEAP_NO);
+
+ return(ret);
+}
+
+/**********************************************************
+The following function is used to set the heap number field in the record. */
+UNIV_INLINE
+void
+rec_set_heap_no(
+/*=============*/
+ rec_t* rec, /* in: physical record */
+ ulint heap_no)/* in: the heap number */
+{
+ ut_ad(heap_no <= REC_MAX_HEAP_NO);
+
+ rec_set_bit_field_2(rec, heap_no, REC_HEAP_NO, REC_HEAP_NO_MASK,
+ REC_HEAP_NO_SHIFT);
+}
+
+/**********************************************************
+The following function is used to test whether the data offsets in the record
+are stored in one-byte or two-byte format. */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ /* out: TRUE if 1-byte form */
+ rec_t* rec) /* in: physical record */
+{
+ ut_ad(TRUE == 1);
+
+ return(rec_get_bit_field_1(rec, REC_SHORT, REC_SHORT_MASK,
+ REC_SHORT_SHIFT));
+}
+
+/**********************************************************
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+ rec_t* rec, /* in: physical record */
+ ibool flag) /* in: TRUE if 1byte form */
+{
+ ut_ad(TRUE == 1);
+ ut_ad(flag <= TRUE);
+
+ rec_set_bit_field_1(rec, flag, REC_SHORT, REC_SHORT_MASK,
+ REC_SHORT_SHIFT);
+}
+
+/**********************************************************
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value. */
+UNIV_INLINE
+ulint
+rec_1_get_field_end_info(
+/*=====================*/
+ /* out: offset of the start of the field, SQL null
+ flag ORed */
+ rec_t* rec, /* in: record */
+ ulint n) /* in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields(rec));
+
+ return(mach_read_from_1(rec - (REC_N_EXTRA_BYTES + n + 1)));
+}
+
+/**********************************************************
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value. */
+UNIV_INLINE
+ulint
+rec_2_get_field_end_info(
+/*=====================*/
+ /* out: offset of the start of the field, SQL null
+ flag ORed */
+ rec_t* rec, /* in: record */
+ ulint n) /* in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields(rec));
+
+ return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2)));
+}
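+
+/**********************************************************
+Editor's note: a worked example, not part of the original Innobase source.
+In the 1-byte offsets form the end offset of each field is stored in a
+single byte directly below the REC_N_EXTRA_BYTES fixed extra bytes, in
+field order going downwards from the origin. The offsets are cumulative
+and counted from the record origin: a record with two fields of lengths
+4 and 6 stores the byte 4 at rec - (REC_N_EXTRA_BYTES + 1) and the byte
+10 at rec - (REC_N_EXTRA_BYTES + 2). If a field is SQL null,
+REC_1BYTE_SQL_NULL_MASK is ORed into its offset byte. The 2-byte form
+works the same way with two bytes per field and REC_2BYTE_SQL_NULL_MASK. */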
+
+/**********************************************************
+Returns the offset of the (n - 1)th field end if the record is stored in the
+1-byte offsets form. If the field is SQL null, the flag is ORed in the
+returned value. This function and the 2-byte counterpart are defined here
+because the C compiler was not able to sum negative and positive constant
+offsets, and warned of constant arithmetic overflow within the compiler. */
+UNIV_INLINE
+ulint
+rec_1_get_prev_field_end_info(
+/*==========================*/
+ /* out: offset of the start of the PREVIOUS field, SQL
+ null flag ORed */
+ rec_t* rec, /* in: record */
+ ulint n) /* in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields(rec));
+
+ return(mach_read_from_1(rec - (REC_N_EXTRA_BYTES + n)));
+}
+
+/**********************************************************
+Returns the offset of the (n - 1)th field end if the record is stored in the
+2-byte offsets form. If the field is SQL null, the flag is ORed in the
+returned value. */
+UNIV_INLINE
+ulint
+rec_2_get_prev_field_end_info(
+/*==========================*/
+ /* out: offset of the start of the PREVIOUS field, SQL
+ null flag ORed */
+ rec_t* rec, /* in: record */
+ ulint n) /* in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields(rec));
+
+ return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n)));
+}
+
+/**********************************************************
+Sets the field end info for the nth field if the record is stored in the
+1-byte format. */
+UNIV_INLINE
+void
+rec_1_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /* in: record */
+ ulint n, /* in: field index */
+ ulint info) /* in: value to set */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields(rec));
+
+ mach_write_to_1(rec - (REC_N_EXTRA_BYTES + n + 1), info);
+}
+
+/**********************************************************
+Sets the field end info for the nth field if the record is stored in the
+2-byte format. */
+UNIV_INLINE
+void
+rec_2_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /* in: record */
+ ulint n, /* in: field index */
+ ulint info) /* in: value to set */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields(rec));
+
+ mach_write_to_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2), info);
+}
+
+/**********************************************************
+Returns the offset of nth field start if the record is stored in the 1-byte
+offsets form. */
+UNIV_INLINE
+ulint
+rec_1_get_field_start_offs(
+/*=======================*/
+ /* out: offset of the start of the field */
+ rec_t* rec, /* in: record */
+ ulint n) /* in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_1_get_prev_field_end_info(rec, n)
+ & ~REC_1BYTE_SQL_NULL_MASK);
+}
+
+/**********************************************************
+Returns the offset of nth field start if the record is stored in the 2-byte
+offsets form. */
+UNIV_INLINE
+ulint
+rec_2_get_field_start_offs(
+/*=======================*/
+ /* out: offset of the start of the field */
+ rec_t* rec, /* in: record */
+ ulint n) /* in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_2_get_prev_field_end_info(rec, n)
+ & ~REC_2BYTE_SQL_NULL_MASK);
+}
+
+/**********************************************************
+The following function is used to read the offset of the start of a data field
+in the record. The start of an SQL null field is the end offset of the
+previous non-null field, or 0, if none exists. If n is the number of the last
+field + 1, then the end offset of the last field is returned. */
+UNIV_INLINE
+ulint
+rec_get_field_start_offs(
+/*=====================*/
+ /* out: offset of the start of the field */
+ rec_t* rec, /* in: record */
+ ulint n) /* in: field index */
+{
+ ut_ad(rec);
+ ut_ad(n <= rec_get_n_fields(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ return(rec_1_get_field_start_offs(rec, n));
+ }
+
+ return(rec_2_get_field_start_offs(rec, n));
+}
+
+/****************************************************************
+Gets the physical size of a field. Note that an SQL null field may also have
+a size > 0, if the data type is of a fixed size. */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ /* out: field size in bytes */
+ rec_t* rec, /* in: record */
+ ulint n) /* in: index of the field */
+{
+ ulint os;
+ ulint next_os;
+
+ os = rec_get_field_start_offs(rec, n);
+ next_os = rec_get_field_start_offs(rec, n + 1);
+
+ ut_ad(next_os - os < UNIV_PAGE_SIZE);
+
+ return(next_os - os);
+}
+
+/****************************************************************
+The following function is used to get a copy of the nth data field in a
+record to a buffer. */
+UNIV_INLINE
+void
+rec_copy_nth_field(
+/*===============*/
+ void* buf, /* in: pointer to the buffer */
+ rec_t* rec, /* in: record */
+ ulint n, /* in: index of the field */
+ ulint* len) /* out: length of the field; UNIV_SQL_NULL if SQL
+ null */
+{
+ byte* ptr;
+
+ ut_ad(buf && rec && len);
+
+ ptr = rec_get_nth_field(rec, n, len);
+
+ if (*len == UNIV_SQL_NULL) {
+
+ return;
+ }
+
+ ut_memcpy(buf, ptr, *len);
+}
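+
+/* Editor's note, not part of the original Innobase source: the buffer
+passed to rec_copy_nth_field must have room for at least
+rec_get_nth_field_size(rec, n) bytes; for an SQL null field nothing is
+copied and *len is set to UNIV_SQL_NULL. */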
+
+/***************************************************************
+This is used to modify the value of an already existing field in a record.
+The previous value must have exactly the same size as the new value. If len
+is UNIV_SQL_NULL then the field is treated as an SQL null. */
+UNIV_INLINE
+void
+rec_set_nth_field(
+/*==============*/
+ rec_t* rec, /* in: record */
+ ulint n, /* in: index of the field */
+ void* data, /* in: pointer to the data if not SQL null */
+ ulint len) /* in: length of the data or UNIV_SQL_NULL */
+{
+ byte* data2;
+ ulint len2;
+
+ ut_ad((len == UNIV_SQL_NULL)
+ || (rec_get_nth_field_size(rec, n) == len));
+
+ if (len == UNIV_SQL_NULL) {
+ rec_set_nth_field_sql_null(rec, n);
+
+ return;
+ }
+
+ data2 = rec_get_nth_field(rec, n, &len2);
+
+ ut_memcpy(data2, data, len);
+
+ if (len2 == UNIV_SQL_NULL) {
+
+ rec_set_nth_field_null_bit(rec, n, FALSE);
+ }
+}
+
+/**************************************************************
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes. */
+UNIV_INLINE
+ulint
+rec_get_data_size(
+/*==============*/
+ /* out: size */
+ rec_t* rec) /* in: physical record */
+{
+ ut_ad(rec);
+
+ return(rec_get_field_start_offs(rec, rec_get_n_fields(rec)));
+}
+
+/**************************************************************
+Returns the total size of record minus data size of record. The value
+returned by the function is the distance from record start to record origin
+in bytes. */
+UNIV_INLINE
+ulint
+rec_get_extra_size(
+/*===============*/
+ /* out: size */
+ rec_t* rec) /* in: physical record */
+{
+ ulint n_fields;
+
+ ut_ad(rec);
+
+ n_fields = rec_get_n_fields(rec);
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ return(REC_N_EXTRA_BYTES + n_fields);
+ }
+
+ return(REC_N_EXTRA_BYTES + 2 * n_fields);
+}
+
+/**************************************************************
+Returns the total size of a physical record. */
+UNIV_INLINE
+ulint
+rec_get_size(
+/*=========*/
+ /* out: size */
+ rec_t* rec) /* in: physical record */
+{
+ ulint n_fields;
+
+ ut_ad(rec);
+
+ n_fields = rec_get_n_fields(rec);
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ return(REC_N_EXTRA_BYTES + n_fields
+ + rec_1_get_field_start_offs(rec, n_fields));
+ }
+
+ return(REC_N_EXTRA_BYTES + 2 * n_fields
+ + rec_2_get_field_start_offs(rec, n_fields));
+}
+
+/**************************************************************
+Returns a pointer to the end of the record. */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ /* out: pointer to end */
+ rec_t* rec) /* in: pointer to record */
+{
+ return(rec + rec_get_data_size(rec));
+}
+
+/**************************************************************
+Returns a pointer to the start of the record. */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ /* out: pointer to start */
+ rec_t* rec) /* in: pointer to record */
+{
+ return(rec - rec_get_extra_size(rec));
+}
+
+/*******************************************************************
+Copies a physical record to a buffer. */
+UNIV_INLINE
+rec_t*
+rec_copy(
+/*=====*/
+ /* out: pointer to the origin of the copied record */
+ void* buf, /* in: buffer */
+ rec_t* rec) /* in: physical record */
+{
+ ulint extra_len;
+ ulint data_len;
+
+ ut_ad(rec && buf);
+ ut_ad(rec_validate(rec));
+
+ extra_len = rec_get_extra_size(rec);
+ data_len = rec_get_data_size(rec);
+
+ ut_memcpy(buf, rec - extra_len, extra_len + data_len);
+
+ return((byte*)buf + extra_len);
+}
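+
+/**********************************************************
+Editor's note: an illustrative sketch, not part of the original Innobase
+source, showing the typical calling pattern of rec_copy: the buffer must
+hold both the extra bytes and the data bytes, so it is sized with
+rec_get_size. The helper name is hypothetical. */
+#if 0
+UNIV_INLINE
+rec_t*
+rec_copy_to_heap_example(
+/*=====================*/
+				/* out: pointer to the origin of the copy */
+	rec_t*		rec,	/* in: physical record */
+	mem_heap_t*	heap)	/* in: memory heap */
+{
+	byte*	buf;
+
+	buf = mem_heap_alloc(heap, rec_get_size(rec));
+
+	/* The returned origin lies rec_get_extra_size(rec) bytes above
+	buf */
+	return(rec_copy(buf, rec));
+}
+#endif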
+
+/**************************************************************
+Returns the extra size of a physical record if we know its data size and
+the number of fields. */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ /* out: extra size */
+ ulint data_size, /* in: data size */
+ ulint n_fields) /* in: number of fields */
+{
+ if (data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+ return(REC_N_EXTRA_BYTES + n_fields);
+ }
+
+ return(REC_N_EXTRA_BYTES + 2 * n_fields);
+}
+
+/**************************************************************
+The following function returns the size of a data tuple when converted to
+a physical record. */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ /* out: size */
+ dtuple_t* dtuple) /* in: data tuple */
+{
+ ulint data_size;
+ ulint extra_size;
+
+ ut_ad(dtuple);
+ ut_ad(dtuple_check_typed(dtuple));
+
+ data_size = dtuple_get_data_size(dtuple);
+
+ extra_size = rec_get_converted_extra_size(
+ data_size, dtuple_get_n_fields(dtuple));
+
+ return(data_size + extra_size);
+}
+
+/****************************************************************
+Folds a prefix of a physical record to a ulint. */
+UNIV_INLINE
+ulint
+rec_fold(
+/*=====*/
+ /* out: the folded value */
+ rec_t* rec, /* in: the physical record */
+ ulint n_fields, /* in: number of complete fields to fold */
+ ulint n_bytes, /* in: number of bytes to fold in an
+ incomplete last field */
+ dulint tree_id) /* in: index tree id */
+{
+ ulint i;
+ byte* data;
+ ulint len;
+ ulint fold;
+
+ ut_ad(rec_validate(rec));
+ ut_ad(n_fields <= rec_get_n_fields(rec));
+ ut_ad((n_fields < rec_get_n_fields(rec)) || (n_bytes == 0));
+ ut_ad(n_fields + n_bytes > 0);
+ /* Only the page supremum and infimum records have 1 field: */
+ ut_ad(rec_get_n_fields(rec) > 1);
+
+ fold = ut_fold_dulint(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ data = rec_get_nth_field(rec, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ data = rec_get_nth_field(rec, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
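+
+/* Editor's note, not part of the original Innobase source: to the
+editor's understanding this fold value is what the adaptive hash index
+code (btr0sea, ha0ha) uses as the hash of an index record; folding in
+the tree id keeps identical key prefixes in different indexes from
+systematically hashing to the same value. */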
+
+/*************************************************************
+Builds a physical record out of a data tuple and stores it beginning from
+the address destination. */
+UNIV_INLINE
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+ /* out: pointer to the origin of physical
+ record */
+ byte* destination, /* in: start address of the physical record */
+ dtuple_t* dtuple) /* in: data tuple */
+{
+ return(rec_convert_dtuple_to_rec_low(destination, dtuple,
+ dtuple_get_data_size(dtuple)));
+}
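+
+/**********************************************************
+Editor's note: an illustrative sketch, not part of the original Innobase
+source, showing the typical calling pattern of rec_convert_dtuple_to_rec:
+the destination buffer must hold the whole converted record, extra bytes
+included, so it is sized with rec_get_converted_size. The helper name is
+hypothetical. */
+#if 0
+UNIV_INLINE
+rec_t*
+rec_convert_dtuple_to_heap_example(
+/*===============================*/
+				/* out: pointer to the record origin */
+	dtuple_t*	dtuple,	/* in: data tuple */
+	mem_heap_t*	heap)	/* in: memory heap */
+{
+	byte*	buf;
+
+	buf = mem_heap_alloc(heap, rec_get_converted_size(dtuple));
+
+	return(rec_convert_dtuple_to_rec(buf, dtuple));
+}
+#endif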
diff --git a/innobase/include/rem0types.h b/innobase/include/rem0types.h
new file mode 100644
index 00000000000..94c394499c5
--- /dev/null
+++ b/innobase/include/rem0types.h
@@ -0,0 +1,16 @@
+/************************************************************************
+Record manager global types
+
+(c) 1994-1996 Innobase Oy
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0types_h
+#define rem0types_h
+
+/* We define the physical record simply as an array of bytes */
+typedef byte rec_t;
+
+
+#endif
diff --git a/innobase/include/row0ins.h b/innobase/include/row0ins.h
new file mode 100644
index 00000000000..94b0e8dec37
--- /dev/null
+++ b/innobase/include/row0ins.h
@@ -0,0 +1,142 @@
+/******************************************************
+Insert into a table
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0ins_h
+#define row0ins_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+
+/*************************************************************************
+Creates an insert node struct. */
+
+ins_node_t*
+ins_node_create(
+/*============*/
+ /* out, own: insert node struct */
+ ulint ins_type, /* in: INS_VALUES, ... */
+ dict_table_t* table, /* in: table where to insert */
+ mem_heap_t* heap); /* in: mem heap where created */
+/*************************************************************************
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /* in: insert node */
+ dtuple_t* row); /* in: new row (or first row) for the node */
+/*******************************************************************
+Tries to insert an index entry to an index. If the index is clustered
+and a record with the same unique key is found, the other record is
+necessarily marked deleted by a committed transaction, or a unique key
+violation error occurs. The delete marked record is then updated to an
+existing record, and we must write an undo log record on the delete
+marked record. If the index is secondary, and a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted to the index. */
+
+ulint
+row_ins_index_entry_low(
+/*====================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL
+ if pessimistic retry needed, or error code */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to insert */
+ que_thr_t* thr); /* in: query thread */
+/*******************************************************************
+Inserts an index entry to index. Tries first optimistic, then pessimistic
+descent down the tree. If the entry matches enough to a delete marked record,
+performs the insert by updating or delete unmarking the delete marked
+record. */
+
+ulint
+row_ins_index_entry(
+/*================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ DB_DUPLICATE_KEY, or some other error code */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to insert */
+ que_thr_t* thr); /* in: query thread */
+/***************************************************************
+Inserts a row to a table. */
+
+ulint
+row_ins(
+/*====*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ ins_node_t* node, /* in: row insert node */
+ que_thr_t* thr); /* in: query thread */
+/***************************************************************
+Inserts a row to a table. This is a high-level function used in
+SQL execution graphs. */
+
+que_thr_t*
+row_ins_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+
+/* Insert node structure */
+
+struct ins_node_struct{
+ que_common_t common; /* node type: QUE_NODE_INSERT */
+ ulint ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+ dtuple_t* row; /* row to insert */
+ dict_table_t* table; /* table where to insert */
+ sel_node_t* select; /* select in searched insert */
+ que_node_t* values_list;/* list of expressions to evaluate and
+ insert in an INS_VALUES insert */
+ ulint state; /* node execution state */
+ dict_index_t* index; /* NULL, or the next index where the index
+ entry should be inserted */
+ dtuple_t* entry; /* NULL, or entry to insert in the index;
+ after a successful insert of the entry,
+ this should be reset to NULL */
+ UT_LIST_BASE_NODE_T(dtuple_t)
+ entry_list;/* list of entries, one for each index */
+ byte* row_id_buf;/* buffer for the row id sys field in row */
+ dulint trx_id; /* trx id or the last trx which executed the
+ node */
+ byte* trx_id_buf;/* buffer for the trx id sys field in row */
+ mem_heap_t* entry_sys_heap;
+ /* memory heap used as auxiliary storage;
+ entry_list and sys fields are stored here;
+ if this is NULL, entry list should be created
+ and buffers for sys fields in row allocated */
+ ulint magic_n;
+};
+
+#define INS_NODE_MAGIC_N 15849075
+
+/* Insert node types */
+#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */
+#define INS_VALUES 1 /* INSERT INTO ... VALUES ... */
+#define INS_DIRECT 2 /* this is for internal use in dict0crea:
+ insert the row directly */
+
+/* Node execution states */
+#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */
+#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */
+#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and
+ inserted */
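+
+/* Editor's note, not part of the original Innobase source: to the
+editor's understanding an insert traverses the states above in order:
+the IX table lock is set first, a row id is allocated only when the
+clustered index was generated on the row id, and the entries in
+entry_list are then built and inserted one index at a time, with 'index'
+and 'entry' in the node pointing to the next entry to insert. */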
+
+#ifndef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#endif
diff --git a/innobase/include/row0ins.ic b/innobase/include/row0ins.ic
new file mode 100644
index 00000000000..80a232d41ee
--- /dev/null
+++ b/innobase/include/row0ins.ic
@@ -0,0 +1,9 @@
+/******************************************************
+Insert into a table
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/innobase/include/row0mysql.h b/innobase/include/row0mysql.h
new file mode 100644
index 00000000000..ee631bc02dc
--- /dev/null
+++ b/innobase/include/row0mysql.h
@@ -0,0 +1,359 @@
+/******************************************************
+Interface between Innobase row operations and MySQL.
+Also contains create table and other data dictionary operations.
+
+(c) 2000 Innobase Oy
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0mysql_h
+#define row0mysql_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "btr0pcur.h"
+#include "trx0types.h"
+
+typedef struct row_prebuilt_struct row_prebuilt_t;
+
+/***********************************************************************
+Stores a variable-length field (like VARCHAR) length to dest, in the
+MySQL format. */
+UNIV_INLINE
+byte*
+row_mysql_store_var_len(
+/*====================*/
+ /* out: dest + 2 */
+ byte* dest, /* in: where to store */
+ ulint len); /* in: length, must fit in two bytes */
+/***********************************************************************
+Reads a MySQL format variable-length field (like VARCHAR) length and
+returns pointer to the field data. */
+UNIV_INLINE
+byte*
+row_mysql_read_var_ref(
+/*===================*/
+ /* out: field + 2 */
+ ulint* len, /* out: variable-length field length */
+ byte* field); /* in: field */
+/***********************************************************************
+Reads a MySQL format variable-length field (like VARCHAR) length and
+returns pointer to the field data. */
+
+byte*
+row_mysql_read_var_ref_noninline(
+/*=============================*/
+ /* out: field + 2 */
+ ulint* len, /* out: variable-length field length */
+ byte* field); /* in: field */
+/***********************************************************************
+Stores a reference to a BLOB in the MySQL format. */
+
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /* in: where to store */
+ ulint col_len, /* in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ this may vary from 1 to 4 bytes */
+ byte* data, /* in: BLOB data */
+ ulint len); /* in: BLOB length */
+/***********************************************************************
+Reads a reference to a BLOB in the MySQL format. */
+
+byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ /* out: pointer to BLOB data */
+ ulint* len, /* out: BLOB length */
+ byte* ref, /* in: BLOB reference in the MySQL format */
+ ulint col_len); /* in: BLOB reference length (not BLOB
+ length) */
+/******************************************************************
+Stores a non-SQL-NULL field given in the MySQL format in the Innobase
+format. */
+UNIV_INLINE
+void
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /* in/out: dfield */
+ byte* buf, /* in/out: buffer for the converted
+ value */
+ byte* mysql_data, /* in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /* in: MySQL column length */
+ ulint type, /* in: data type */
+ ulint is_unsigned); /* in: != 0 if unsigned integer type */
+/********************************************************************
+Handles user errors and lock waits detected by the database engine. */
+
+ibool
+row_mysql_handle_errors(
+/*====================*/
+ /* out: TRUE if it was a lock wait and
+ we should continue running the query thread */
+ ulint* new_err,/* out: possible new error encountered in
+ rollback, or the old error which was
+ during the function entry */
+ trx_t* trx, /* in: transaction */
+ que_thr_t* thr, /* in: query thread */
+ trx_savept_t* savept);/* in: savepoint */
+/************************************************************************
+Create a prebuilt struct for a MySQL table handle. */
+
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ /* out, own: a prebuilt struct */
+ dict_table_t* table); /* in: Innobase table handle */
+/************************************************************************
+Free a prebuilt struct for a MySQL table handle. */
+
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt); /* in, own: prebuilt struct */
+/*************************************************************************
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+
+void
+row_update_prebuilt_trx(
+/*====================*/
+ /* out: prebuilt dtuple */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL
+ handle */
+ trx_t* trx); /* in: transaction handle */
+/*************************************************************************
+Does an insert for MySQL. */
+
+int
+row_insert_for_mysql(
+/*=================*/
+ /* out: error code or DB_SUCCESS */
+ byte* mysql_rec, /* in: row in the MySQL format */
+ row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL
+ handle */
+/*************************************************************************
+Builds a dummy query graph used in selects. */
+
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL
+ handle */
+/*************************************************************************
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it. */
+
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ /* out: prebuilt update vector */
+ row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL
+ handle */
+/*************************************************************************
+Checks if a table is such that we automatically created a clustered
+index on it (on row id). */
+
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+ dict_table_t* table);
+/*************************************************************************
+Does an update or delete of a row for MySQL. */
+
+int
+row_update_for_mysql(
+/*=================*/
+ /* out: error code or DB_SUCCESS */
+ byte* mysql_rec, /* in: the row to be updated, in
+ the MySQL format */
+ row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL
+ handle */
+/*************************************************************************
+Does a table creation operation for MySQL. */
+
+int
+row_create_table_for_mysql(
+/*=======================*/
+ /* out: error code or DB_SUCCESS */
+ dict_table_t* table, /* in: table definition */
+ trx_t* trx); /* in: transaction handle */
+/*************************************************************************
+Does an index creation operation for MySQL. TODO: currently failure
+to create an index results in dropping the whole table! This is no problem
+currently as all indexes must be created at the same time as the table. */
+
+int
+row_create_index_for_mysql(
+/*=======================*/
+ /* out: error number or DB_SUCCESS */
+	dict_index_t*	index,		/* in: index definition */
+ trx_t* trx); /* in: transaction handle */
+/*************************************************************************
+Drops a table for MySQL. */
+
+int
+row_drop_table_for_mysql(
+/*=====================*/
+ /* out: error code or DB_SUCCESS */
+ char* name, /* in: table name */
+ trx_t* trx, /* in: transaction handle */
+ ibool has_dict_mutex);/* in: TRUE if the caller already owns the
+ dictionary system mutex */
+/*************************************************************************
+Renames a table for MySQL. */
+
+int
+row_rename_table_for_mysql(
+/*=======================*/
+ /* out: error code or DB_SUCCESS */
+ char* old_name, /* in: old table name */
+ char* new_name, /* in: new table name */
+ trx_t* trx); /* in: transaction handle */
+
+/* A struct describing a place for an individual column in the MySQL
+row format which is presented to the table handler in ha_innobase.
+This template struct is used to speed up row transformations between
+Innobase and MySQL. */
+
+typedef struct mysql_row_templ_struct mysql_row_templ_t;
+struct mysql_row_templ_struct {
+ ulint col_no; /* column number of the column */
+ ulint rec_field_no; /* field number of the column in an
+ Innobase record in the current index;
+ not defined if template_type is
+ ROW_MYSQL_WHOLE_ROW */
+ ulint mysql_col_offset; /* offset of the column in the MySQL
+ row format */
+ ulint mysql_col_len; /* length of the column in the MySQL
+ row format */
+ ulint mysql_null_byte_offset; /* MySQL NULL bit byte offset in a
+ MySQL record */
+ ulint mysql_null_bit_mask; /* bit mask to get the NULL bit,
+ zero if column cannot be NULL */
+ ulint type; /* column type in Innobase mtype
+ numbers DATA_CHAR... */
+ ulint is_unsigned; /* if a column type is an integer
+ type and this field is != 0, then
+ it is an unsigned integer type */
+};
+
+#define MYSQL_FETCH_CACHE_SIZE 8
+/* After fetching this many rows, we start caching them in fetch_cache */
+#define MYSQL_FETCH_CACHE_THRESHOLD 4
+
+
+/* A struct for (sometimes lazily) prebuilt structures in an Innobase table
+handle used within MySQL; these are used to save CPU time. */
+
+struct row_prebuilt_struct {
+ dict_table_t* table; /* Innobase table handle */
+ trx_t* trx; /* current transaction handle */
+ ibool sql_stat_start; /* TRUE when we start processing of
+ an SQL statement: we may have to set
+ an intention lock on the table,
+ create a consistent read view etc. */
+ ibool clust_index_was_generated;
+ /* if the user did not define a
+ primary key in MySQL, then Innobase
+ automatically generated a clustered
+ index where the ordering column is
+ the row id: in this case this flag
+ is set to TRUE */
+ dict_index_t* index; /* current index for a search, if any */
+ ulint template_type; /* ROW_MYSQL_WHOLE_ROW,
+ ROW_MYSQL_REC_FIELDS or
+ ROW_MYSQL_NO_TEMPLATE */
+ ulint n_template; /* number of elements in the
+ template */
+ ulint null_bitmap_len;/* number of bytes in the SQL NULL
+ bitmap at the start of a row in the
+ MySQL format */
+ ibool need_to_access_clustered; /* if we are fetching
+ columns through a secondary index
+ and at least one column is not in
+ the secondary index, then this is
+ set to TRUE */
+ ibool templ_contains_blob;/* TRUE if the template contains
+ BLOB column(s) */
+ mysql_row_templ_t* mysql_template;/* template used to transform
+ rows fast between MySQL and Innobase
+ formats; memory for this template
+ is not allocated from 'heap' */
+ mem_heap_t* heap; /* memory heap from which
+ these auxiliary structures are
+ allocated when needed */
+ ins_node_t* ins_node; /* Innobase SQL insert node
+ used to perform inserts
+ to the table */
+ byte* ins_upd_rec_buff;/* buffer for storing data converted
+ to the Innobase format from the MySQL
+ format */
+ ibool in_update_remember_pos;
+ /* if an update is processed, then if
+ this flag is set to TRUE, it means
+ that the stored cursor position in
+ SELECT is the right position also
+ for the update: we can just restore
+ the cursor and save CPU time */
+ upd_node_t* upd_node; /* Innobase SQL update node used
+ to perform updates and deletes */
+ que_fork_t* ins_graph; /* Innobase SQL query graph used
+ in inserts */
+ que_fork_t* upd_graph; /* Innobase SQL query graph used
+ in updates or deletes */
+ btr_pcur_t* pcur; /* persistent cursor used in selects
+ and updates */
+ btr_pcur_t* clust_pcur; /* persistent cursor used in
+ some selects and updates */
+ que_fork_t* sel_graph; /* dummy query graph used in
+ selects */
+ dtuple_t* search_tuple; /* prebuilt dtuple used in selects */
+ byte row_id[DATA_ROW_ID_LEN];
+ /* if the clustered index was generated,
+ the row id of the last row fetched is
+ stored here */
+ dtuple_t* clust_ref; /* prebuilt dtuple used in
+ sel/upd/del */
+ ulint select_lock_type;/* LOCK_NONE, LOCK_S, or LOCK_X */
+ ulint mysql_row_len; /* length in bytes of a row in the
+ MySQL format */
+ ulint n_rows_fetched; /* number of rows fetched after
+ positioning the current cursor */
+ ulint fetch_direction;/* ROW_SEL_NEXT or ROW_SEL_PREV */
+ byte* fetch_cache[MYSQL_FETCH_CACHE_SIZE];
+ /* a cache for fetched rows if we
+ fetch many rows from the same cursor:
+ it saves CPU time to fetch them in a
+ batch; we reserve mysql_row_len
+ bytes for each such row */
+ ulint fetch_cache_first;/* position of the first not yet
+ fetched row in fetch_cache */
+ ulint n_fetch_cached; /* number of not yet fetched rows
+ in fetch_cache */
+ mem_heap_t* blob_heap; /* in SELECTS BLOB fields are copied
+ to this heap */
+ mem_heap_t* old_vers_heap; /* memory heap where a previous
+ version is built in consistent read */
+};
+
+#define ROW_MYSQL_WHOLE_ROW 0
+#define ROW_MYSQL_REC_FIELDS 1
+#define ROW_MYSQL_NO_TEMPLATE 2
+
+#ifndef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#endif
diff --git a/innobase/include/row0mysql.ic b/innobase/include/row0mysql.ic
new file mode 100644
index 00000000000..773e25a87ef
--- /dev/null
+++ b/innobase/include/row0mysql.ic
@@ -0,0 +1,97 @@
+/******************************************************
+MySQL interface for Innobase
+
+(C) 2001 Innobase Oy
+
+Created 1/23/2001 Heikki Tuuri
+*******************************************************/
+
+/***********************************************************************
+Stores a variable-length field (like VARCHAR) length to dest, in the
+MySQL format. No real var implemented in MySQL yet! */
+UNIV_INLINE
+byte*
+row_mysql_store_var_len(
+/*====================*/
+ /* out: dest + 2 */
+ byte* dest, /* in: where to store */
+ ulint len) /* in: length, must fit in two bytes */
+{
+ ut_ad(len < 256 * 256);
+/*
+ mach_write_to_2_little_endian(dest, len);
+
+ return(dest + 2);
+*/
+ return(dest); /* No real var implemented in MySQL yet! */
+}
+
+/***********************************************************************
+Reads a MySQL format variable-length field (like VARCHAR) length and
+returns pointer to the field data. No real var implemented in MySQL yet! */
+UNIV_INLINE
+byte*
+row_mysql_read_var_ref(
+/*===================*/
+ /* out: field + 2 */
+ ulint* len, /* out: variable-length field length; does not work
+ yet! */
+ byte* field) /* in: field */
+{
+/*
+ *len = mach_read_from_2_little_endian(field);
+
+ return(field + 2);
+*/
+ return(field); /* No real var implemented in MySQL yet! */
+}
+
+/******************************************************************
+Stores a non-SQL-NULL field given in the MySQL format in the Innobase
+format. */
+UNIV_INLINE
+void
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /* in/out: dfield */
+ byte* buf, /* in/out: buffer for the converted
+ value */
+ byte* mysql_data, /* in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /* in: MySQL column length */
+ ulint type, /* in: data type */
+ ulint is_unsigned) /* in: != 0 if unsigned integer type */
+{
+ byte* ptr = mysql_data;
+
+ if (type == DATA_INT) {
+ /* Store integer data in Innobase in a big-endian format,
+ sign bit negated */
+
+ ptr = buf + col_len;
+
+ for (;;) {
+ ptr--;
+ *ptr = *mysql_data;
+ if (ptr == buf) {
+ break;
+ }
+ mysql_data++;
+ }
+
+ if (!is_unsigned) {
+ *ptr = *ptr ^ 128;
+ }
+ } else if (type == DATA_VARCHAR || type == DATA_VARMYSQL
+ || type == DATA_BINARY) {
+ ptr = row_mysql_read_var_ref(&col_len, mysql_data);
+
+ } else if (type == DATA_BLOB) {
+ ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len);
+ }
+
+ dfield_set_data(dfield, ptr, col_len);
+}
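+
+/* Editor's note: a worked example, not part of the original Innobase
+source. Assuming MySQL stores an integer column low byte first, the loop
+above reverses the bytes, and the sign bit of a signed column is then
+negated, so that values compare correctly as unsigned big-endian byte
+strings:
+
+	MySQL bytes (col_len 4)		stored in Innobase
+	-1:	FF FF FF FF		7F FF FF FF
+	 0:	00 00 00 00		80 00 00 00
+	 1:	01 00 00 00		80 00 00 01
+
+For an unsigned column the top bit is left as it is. */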
diff --git a/innobase/include/row0purge.h b/innobase/include/row0purge.h
new file mode 100644
index 00000000000..4c863441442
--- /dev/null
+++ b/innobase/include/row0purge.h
@@ -0,0 +1,80 @@
+/******************************************************
+Purge obsolete records
+
+(c) 1997 Innobase Oy
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0purge_h
+#define row0purge_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/************************************************************************
+Creates a purge node to a query graph. */
+
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+ /* out, own: purge node */
+ que_thr_t* parent, /* in: parent node, i.e., a thr node */
+ mem_heap_t* heap); /* in: memory heap where created */
+/***************************************************************
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph. */
+
+que_thr_t*
+row_purge_step(
+/*===========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+
+/* Purge node structure */
+
+struct purge_node_struct{
+ que_common_t common; /* node type: QUE_NODE_PURGE */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ dulint roll_ptr;/* roll pointer to undo log record */
+ trx_undo_rec_t* undo_rec;/* undo log record */
+ trx_undo_inf_t* reservation;/* reservation for the undo log record in
+ the purge array */
+ dulint undo_no;/* undo number of the record */
+ ulint rec_type;/* undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ btr_pcur_t pcur; /* persistent cursor used in searching the
+ clustered index record */
+ ibool found_clust;/* TRUE if the clustered index record
+ determined by ref was found in the clustered
+ index, and we were able to position pcur on
+ it */
+ dict_table_t* table; /* table where purge is done; NOTE that the
+ table has to be released explicitly with
+ dict_table_release */
+ ulint cmpl_info;/* compiler analysis info of an update */
+ upd_t* update; /* update vector for a clustered index record */
+ dtuple_t* ref; /* NULL, or row reference to the next row to
+ handle */
+ dtuple_t* row; /* NULL, or a copy (also fields copied to
+ heap) of the indexed fields of the row to
+ handle */
+ dict_index_t* index; /* NULL, or the next index whose record should
+ be handled */
+ mem_heap_t* heap; /* memory heap used as auxiliary storage for
+ row; this must be emptied after a successful
+ purge of a row */
+};
+
+#ifndef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#endif
diff --git a/innobase/include/row0purge.ic b/innobase/include/row0purge.ic
new file mode 100644
index 00000000000..50aabf0bc1b
--- /dev/null
+++ b/innobase/include/row0purge.ic
@@ -0,0 +1,8 @@
+
+/******************************************************
+Purge obsolete records
+
+(c) 1997 Innobase Oy
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/row0row.h b/innobase/include/row0row.h
new file mode 100644
index 00000000000..fb1e1b01ee3
--- /dev/null
+++ b/innobase/include/row0row.h
@@ -0,0 +1,266 @@
+/******************************************************
+General row routines
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0row_h
+#define row0row_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "read0types.h"
+#include "btr0types.h"
+
+/*************************************************************************
+Reads the trx id field from a clustered index record. */
+UNIV_INLINE
+dulint
+row_get_rec_trx_id(
+/*===============*/
+ /* out: value of the field */
+ rec_t* rec, /* in: record */
+ dict_index_t* index); /* in: clustered index */
+/*************************************************************************
+Reads the roll pointer field from a clustered index record. */
+UNIV_INLINE
+dulint
+row_get_rec_roll_ptr(
+/*=================*/
+ /* out: value of the field */
+ rec_t* rec, /* in: record */
+ dict_index_t* index); /* in: clustered index */
+/*************************************************************************
+Writes the trx id field to a clustered index record. */
+UNIV_INLINE
+void
+row_set_rec_trx_id(
+/*===============*/
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ dulint trx_id); /* in: value of the field */
+/*************************************************************************
+Sets the roll pointer field in a clustered index record. */
+UNIV_INLINE
+void
+row_set_rec_roll_ptr(
+/*=================*/
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ dulint roll_ptr);/* in: value of the field */
+/*********************************************************************
+When an insert to a table is performed, this function builds the entry which
+has to be inserted to an index on the table. */
+
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ /* out: index entry which should be inserted */
+ dtuple_t* row, /* in: row which should be inserted to the
+ table */
+ dict_index_t* index, /* in: index on the table */
+ mem_heap_t* heap); /* in: memory heap from which the memory for
+ the index entry is allocated */
+/*********************************************************************
+Builds an index entry from a row. */
+
+void
+row_build_index_entry_to_tuple(
+/*===========================*/
+ dtuple_t* entry, /* in/out: index entry; the dtuple must have
+ enough fields for the index! */
+ dtuple_t* row, /* in: row */
+ dict_index_t* index); /* in: index on the table */
+/***********************************************************************
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index. */
+
+dtuple_t*
+row_build(
+/*======*/
+ /* out, own: row built; see the NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap as the latter only places pointers to
+ data fields on the index page, and thus is
+ more efficient */
+ dict_index_t* index, /* in: clustered index */
+ rec_t* rec, /* in: record in the clustered index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row dtuple is used! */
+ mem_heap_t* heap); /* in: memory heap from which the memory
+ needed is allocated */
+/***********************************************************************
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index. */
+
+void
+row_build_to_tuple(
+/*===============*/
+ dtuple_t* row, /* in/out: row built; see the NOTE below! */
+ dict_index_t* index, /* in: clustered index */
+ rec_t* rec); /* in: record in the clustered index;
+ NOTE: the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row dtuple is used! */
+/***********************************************************************
+Converts an index record to a typed data tuple. */
+
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ /* out, own: index entry built; see the
+ NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap as the latter only places pointers to
+ data fields on the index page */
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the dtuple is used! */
+ mem_heap_t* heap); /* in: memory heap from which the memory
+ needed is allocated */
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ /* out, own: row reference built; see the
+ NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap); /* in: memory heap from which the memory
+ needed is allocated */
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /* in/out: row reference built; see the
+ NOTE below! */
+ dict_index_t* index, /* in: index */
+ rec_t* rec); /* in: record in the index;
+ NOTE: the data fields in ref will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+/***********************************************************************
+From a row build a row reference with which we can search the clustered
+index record. */
+
+void
+row_build_row_ref_from_row(
+/*=======================*/
+ dtuple_t* ref, /* in/out: row reference built; see the
+ NOTE below! ref must have the right number
+ of fields! */
+ dict_table_t* table, /* in: table */
+ dtuple_t* row); /* in: row
+ NOTE: the data fields in ref will point
+ directly into data of this row */
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /* in: typed data tuple where the reference
+ is built */
+ ulint* map, /* in: array of field numbers in rec telling
+ how ref should be built from the fields of
+ rec */
+ rec_t* rec); /* in: record in the index; must be preserved
+ while ref is used, as we do not copy field
+ values to heap */
+/*******************************************************************
+Searches the clustered index record for a row, if we have the row
+reference. */
+
+ibool
+row_search_on_row_ref(
+/*==================*/
+ /* out: TRUE if found */
+ btr_pcur_t* pcur, /* in/out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ dict_table_t* table, /* in: table */
+ dtuple_t* ref, /* in: row reference */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************************
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved. */
+
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ /* out: record or NULL, if no record found */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ rec_t* rec, /* in: record in a secondary index */
+ dict_index_t* index, /* in: secondary index */
+ dict_index_t** clust_index,/* out: clustered index */
+ mtr_t* mtr); /* in: mtr */
+/*******************************************************************
+Searches an index record. */
+
+ibool
+row_search_index_entry(
+/*===================*/
+ /* out: TRUE if found */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /* in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr); /* in: mtr */
+
+
+#define ROW_COPY_DATA 1
+#define ROW_COPY_POINTERS 2
+
+/* The allowed latching order of index records is the following:
+(1) a secondary index record ->
+(2) the clustered index record ->
+(3) rollback segment data for the clustered index record.
+
+No new latches may be obtained while the kernel mutex is reserved.
+However, the kernel mutex can be reserved while latches are owned. */
+
+#ifndef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#endif
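As a rough illustration of how the declarations above are meant to be combined (respecting the latching order note at the end of the header), here is a hedged sketch only; it assumes the surrounding InnoDB build, that the caller has started the mini-transaction and s-latched the secondary index page, and the helper name and the BTR_SEARCH_LEAF latch mode are illustrative assumptions, not part of this patch.

#include "row0row.h"

/* Hedged sketch only: look up the clustered index record that corresponds
to a secondary index record, following the latching order (1) secondary
record -> (2) clustered record described above. */
static rec_t*
example_fetch_clust_rec(
	rec_t*		sec_rec,	/* in: record in a secondary index,
					s-latched by the caller */
	dict_index_t*	sec_index,	/* in: the secondary index */
	mtr_t*		mtr)		/* in: mtr started by the caller */
{
	dict_index_t*	clust_index;

	/* Returns NULL if no matching clustered index record was found */
	return(row_get_clust_rec(BTR_SEARCH_LEAF, sec_rec, sec_index,
						&clust_index, mtr));
}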
diff --git a/innobase/include/row0row.ic b/innobase/include/row0row.ic
new file mode 100644
index 00000000000..8e5121f5a96
--- /dev/null
+++ b/innobase/include/row0row.ic
@@ -0,0 +1,165 @@
+/******************************************************
+General row routines
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "trx0undo.h"
+
+/*************************************************************************
+Reads the trx id or roll ptr field from a clustered index record: this function
+is slower than the specialized inline functions. */
+
+dulint
+row_get_rec_sys_field(
+/*==================*/
+ /* out: value of the field */
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ rec_t* rec, /* in: record */
+ dict_index_t* index); /* in: clustered index */
+/*************************************************************************
+Sets the trx id or roll ptr field in a clustered index record: this function
+is slower than the specialized inline functions. */
+
+void
+row_set_rec_sys_field(
+/*==================*/
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ dulint val); /* in: value to set */
+
+/*************************************************************************
+Reads the trx id field from a clustered index record. */
+UNIV_INLINE
+dulint
+row_get_rec_trx_id(
+/*===============*/
+ /* out: value of the field */
+ rec_t* rec, /* in: record */
+ dict_index_t* index) /* in: clustered index */
+{
+ ulint offset;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ offset = index->trx_id_offset;
+
+ if (offset) {
+ return(trx_read_trx_id(rec + offset));
+ } else {
+ return(row_get_rec_sys_field(DATA_TRX_ID, rec, index));
+ }
+}
+
+/*************************************************************************
+Reads the roll pointer field from a clustered index record. */
+UNIV_INLINE
+dulint
+row_get_rec_roll_ptr(
+/*=================*/
+ /* out: value of the field */
+ rec_t* rec, /* in: record */
+ dict_index_t* index) /* in: clustered index */
+{
+ ulint offset;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ offset = index->trx_id_offset;
+
+ if (offset) {
+ return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN));
+ } else {
+ return(row_get_rec_sys_field(DATA_ROLL_PTR, rec, index));
+ }
+}
+
+/*************************************************************************
+Writes the trx id field to a clustered index record. */
+UNIV_INLINE
+void
+row_set_rec_trx_id(
+/*===============*/
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ dulint trx_id) /* in: value of the field */
+{
+ ulint offset;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ offset = index->trx_id_offset;
+
+ if (offset) {
+ trx_write_trx_id(rec + offset, trx_id);
+ } else {
+ row_set_rec_sys_field(DATA_TRX_ID, rec, index, trx_id);
+ }
+}
+
+/*************************************************************************
+Sets the roll pointer field in a clustered index record. */
+UNIV_INLINE
+void
+row_set_rec_roll_ptr(
+/*=================*/
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ dulint roll_ptr)/* in: value of the field */
+{
+ ulint offset;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ offset = index->trx_id_offset;
+
+ if (offset) {
+ trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr);
+ } else {
+ row_set_rec_sys_field(DATA_ROLL_PTR, rec, index, roll_ptr);
+ }
+}
+
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /* in: typed data tuple where the reference
+ is built */
+ ulint* map, /* in: array of field numbers in rec telling
+ how ref should be built from the fields of
+ rec */
+ rec_t* rec) /* in: record in the index; must be preserved
+ while ref is used, as we do not copy field
+ values to heap */
+{
+ dfield_t* dfield;
+ byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint field_no;
+ ulint i;
+
+ ref_len = dtuple_get_n_fields(ref);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ field_no = *(map + i);
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ field = rec_get_nth_field(rec, field_no, &len);
+ dfield_set_data(dfield, field, len);
+ }
+ }
+}
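A hedged usage sketch for row_build_row_ref_fast() above: the ref tuple is assumed to have been created elsewhere with the right field types, the two-field map and the helper name are illustrative, and rec must stay latched while ref is used since no data is copied.

#include "row0row.h"

/* Hedged sketch only: build a row reference from a secondary index record
without copying field data. map[i] gives the field number in rec for the
nth reference field; ULINT_UNDEFINED leaves that field untouched. */
static void
example_build_ref(
	dtuple_t*	ref,	/* in/out: typed tuple with 2 fields */
	rec_t*		rec)	/* in: secondary index record, latched */
{
	static ulint	map[2] = {1, ULINT_UNDEFINED};

	row_build_row_ref_fast(ref, map, rec);
}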
diff --git a/innobase/include/row0sel.h b/innobase/include/row0sel.h
new file mode 100644
index 00000000000..a64d3f8e425
--- /dev/null
+++ b/innobase/include/row0sel.h
@@ -0,0 +1,330 @@
+/******************************************************
+Select
+
+(c) 1997 Innobase Oy
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0sel_h
+#define row0sel_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "btr0pcur.h"
+#include "read0read.h"
+#include "row0mysql.h"
+
+/*************************************************************************
+Creates a select node struct. */
+
+sel_node_t*
+sel_node_create(
+/*============*/
+ /* out, own: select node struct */
+ mem_heap_t* heap); /* in: memory heap where created */
+/*************************************************************************
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node); /* in: select node struct */
+/*************************************************************************
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf); /* in, own: prefetch buffer */
+/*************************************************************************
+Gets the plan node for the nth table in a join. */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+ sel_node_t* node,
+ ulint i);
+/**************************************************************************
+Performs a select step. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+row_sel_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Performs an execution step of an open or close cursor statement node. */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Performs a fetch for a cursor. */
+
+que_thr_t*
+fetch_step(
+/*=======*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/***************************************************************
+Prints a row in a select result. */
+
+que_thr_t*
+row_printf_step(
+/*============*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/********************************************************************
+Converts a key value stored in MySQL format to an Innobase dtuple.
+The last field of the key value may be just a prefix of a fixed length
+field: hence the parameter key_len. */
+
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /* in: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /* in: buffer to use in field
+ conversions */
+ dict_index_t* index, /* in: index of the key value */
+ byte* key_ptr, /* in: MySQL key value */
+ ulint key_len); /* in: MySQL key value length */
+/************************************************************************
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position, and fetch next or fetch prev must not be tried on the cursor! */
+
+ulint
+row_search_for_mysql(
+/*=================*/
+ /* out: DB_SUCCESS,
+ DB_RECORD_NOT_FOUND,
+ DB_END_OF_INDEX, or DB_DEADLOCK */
+ byte* buf, /* in/out: buffer for the fetched
+ row in the MySQL format */
+ ulint mode, /* in: search mode PAGE_CUR_L, ... */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint match_mode, /* in: 0 or ROW_SEL_EXACT or
+ ROW_SEL_EXACT_PREFIX */
+ ulint direction); /* in: 0 or ROW_SEL_NEXT or
+ ROW_SEL_PREV; NOTE: if this is != 0,
+ then prebuilt must have a pcur
+ with stored position! In opening of a
+ cursor 'direction' should be 0. */
+
+
+/* A structure for caching column values for prefetched rows */
+struct sel_buf_struct{
+ byte* data; /* data, or NULL; if not NULL, this field
+ has allocated memory which must be explicitly
+ freed; can be != NULL even when len is
+ UNIV_SQL_NULL */
+ ulint len; /* data length or UNIV_SQL_NULL */
+ ulint val_buf_size;
+ /* size of memory buffer allocated for data:
+ this can be more than len; this is defined
+ when data != NULL */
+};
+
+struct plan_struct{
+ dict_table_t* table; /* table struct in the dictionary
+ cache */
+ dict_index_t* index; /* table index used in the search */
+ btr_pcur_t pcur; /* persistent cursor used to search
+ the index */
+ ibool asc; /* TRUE if cursor traveling upwards */
+ ibool pcur_is_open; /* TRUE if pcur has been positioned
+ and we can try to fetch new rows */
+ ibool cursor_at_end; /* TRUE if the cursor is open but
+ we know that there are no more
+ qualifying rows left to retrieve from
+ the index tree; NOTE though, that
+ there may still be unprocessed rows in
+ the prefetch stack; always FALSE when
+ pcur_is_open is FALSE */
+ ibool stored_cursor_rec_processed;
+ /* TRUE if the pcur position has been
+ stored and the record it is positioned
+ on has already been processed */
+ que_node_t** tuple_exps; /* array of expressions which are used
+ to calculate the field values in the
+ search tuple: there is one expression
+ for each field in the search tuple */
+ dtuple_t* tuple; /* search tuple */
+ ulint mode; /* search mode: PAGE_CUR_G, ... */
+ ulint n_exact_match; /* number of first fields in the search
+ tuple which must be exactly matched */
+ ibool unique_search; /* TRUE if we are searching an
+ index record with a unique key */
+ ulint n_rows_fetched; /* number of rows fetched using pcur
+ after it was opened */
+ ulint n_rows_prefetched;/* number of prefetched rows cached
+ for fetch: fetching several rows in
+ the same mtr saves CPU time */
+ ulint first_prefetched;/* index of the first cached row in
+ select buffer arrays for each column */
+ ibool no_prefetch; /* no prefetch for this table */
+ ibool mixed_index; /* TRUE if index is a clustered index
+ in a mixed cluster */
+ sym_node_list_t columns; /* symbol table nodes for the columns
+ to retrieve from the table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ end_conds; /* conditions which determine the
+ fetch limit of the index segment we
+ have to look at: when one of these
+ fails, the result set has been
+ exhausted for the cursor in this
+ index; these conditions are normalized
+ so that in a comparison the column
+ for this table is the first argument */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ other_conds; /* the rest of search conditions we can
+ test at this table in a join */
+ ibool must_get_clust; /* TRUE if index is a non-clustered
+ index and we must also fetch the
+ clustered index record; this is the
+ case if the non-clustered record does
+ not contain all the needed columns, or
+ if this is a single-table explicit
+ cursor, or a searched update or
+ delete */
+ ulint* clust_map; /* map telling how clust_ref is built
+ from the fields of a non-clustered
+ record */
+ dtuple_t* clust_ref; /* the reference to the clustered
+ index entry is built here if index is
+ a non-clustered index */
+ btr_pcur_t clust_pcur; /* if index is non-clustered, we use
+ this pcur to search the clustered
+ index */
+ mem_heap_t* old_vers_heap; /* memory heap used in building an old
+ version of a row, or NULL */
+};
+
+struct sel_node_struct{
+ que_common_t common; /* node type: QUE_NODE_SELECT */
+ ulint state; /* node state */
+ que_node_t* select_list; /* select list */
+ sym_node_t* into_list; /* variables list or NULL */
+ sym_node_t* table_list; /* table list */
+ ibool asc; /* TRUE if the rows should be fetched
+ in an ascending order */
+ ibool set_x_locks; /* TRUE if the cursor is for update or
+ delete, which means that a row x-lock
+ should be placed on the cursor row */
+ ibool select_will_do_update;
+ /* TRUE if the select is for a searched
+ update which can be performed in-place:
+ in this case the select will take care
+ of the update */
+ ulint latch_mode; /* BTR_SEARCH_LEAF, or BTR_MODIFY_LEAF
+ if select_will_do_update is TRUE */
+ ulint row_lock_mode; /* LOCK_X or LOCK_S */
+ ulint n_tables; /* number of tables */
+ ulint fetch_table; /* number of the next table to access
+ in the join */
+ plan_t* plans; /* array of n_tables many plan nodes
+ containing the search plan and the
+ search data structures */
+ que_node_t* search_cond; /* search condition */
+ read_view_t* read_view; /* if the query is a non-locking
+ consistent read, its read view is
+ placed here, otherwise NULL */
+ ibool consistent_read;/* TRUE if the select is a consistent,
+ non-locking read */
+ order_node_t* order_by; /* order by column definition, or
+ NULL */
+ ibool is_aggregate; /* TRUE if the select list consists of
+ aggregate functions */
+ ibool aggregate_already_fetched;
+ /* TRUE if the aggregate row has
+ already been fetched for the current
+ cursor */
+ ibool can_get_updated;/* this is TRUE if the select is in a
+ single-table explicit cursor which can
+ get updated within the stored procedure,
+ or in a searched update or delete;
+				NOTE that to determine whether an
+				explicit cursor can get updated, the
+				parser checks whether the stored
+				procedure contains positioned update
+				or delete statements */
+ sym_node_t* explicit_cursor;/* not NULL if an explicit cursor */
+ UT_LIST_BASE_NODE_T(sym_node_t)
+ copy_variables; /* variables whose values we have to
+ copy when an explicit cursor is opened,
+ so that they do not change between
+ fetches */
+};
+
+/* Select node states */
+#define SEL_NODE_CLOSED 0 /* it is a declared cursor which is not
+ currently open */
+#define SEL_NODE_OPEN 1 /* intention locks not yet set on
+ tables */
+#define SEL_NODE_FETCH 2 /* intention locks have been set */
+#define SEL_NODE_NO_MORE_ROWS 3 /* cursor has reached the result set
+ end */
+
+/* Fetch statement node */
+struct fetch_node_struct{
+ que_common_t common; /* type: QUE_NODE_FETCH */
+ sel_node_t* cursor_def; /* cursor definition */
+ sym_node_t* into_list; /* variables to set */
+};
+
+/* Open or close cursor statement node */
+struct open_node_struct{
+ que_common_t common; /* type: QUE_NODE_OPEN */
+ ulint op_type; /* ROW_SEL_OPEN_CURSOR or
+ ROW_SEL_CLOSE_CURSOR */
+ sel_node_t* cursor_def; /* cursor definition */
+};
+
+/* Row printf statement node */
+struct row_printf_node_struct{
+ que_common_t common; /* type: QUE_NODE_ROW_PRINTF */
+ sel_node_t* sel_node; /* select */
+};
+
+#define ROW_SEL_OPEN_CURSOR 0
+#define ROW_SEL_CLOSE_CURSOR 1
+
+/* Flags for the MySQL interface */
+#define ROW_SEL_NEXT 1
+#define ROW_SEL_PREV 2
+
+#define ROW_SEL_EXACT 1 /* search using a complete key value */
+#define ROW_SEL_EXACT_PREFIX 2 /* search using a key prefix which
+ must match to rows: the prefix may
+ contain an incomplete field (the
+ last field in prefix may be just
+ a prefix of a fixed length column) */
+
+#ifndef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#endif
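The fetch protocol described in the comment on row_search_for_mysql() can be illustrated with a hedged sketch; the preparation of the prebuilt struct and of the MySQL-format record buffer is assumed to be done elsewhere (row0mysql.h), and the helper name is illustrative.

#include "row0sel.h"

/* Hedged sketch only: open a cursor on the index chosen in prebuilt and
fetch rows one by one in the MySQL format. The first call must use
direction == 0; later calls use ROW_SEL_NEXT with the stored cursor
position. */
static ulint
example_scan(
	byte*		buf,		/* in/out: MySQL-format row buffer */
	row_prebuilt_t*	prebuilt)	/* in: prepared table handle info */
{
	ulint	err;

	err = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);

	while (err == DB_SUCCESS) {
		/* ... use the row now stored in buf ... */

		err = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0,
							ROW_SEL_NEXT);
	}

	return(err);	/* DB_END_OF_INDEX when the scan is exhausted */
}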
diff --git a/innobase/include/row0sel.ic b/innobase/include/row0sel.ic
new file mode 100644
index 00000000000..9005624b6ca
--- /dev/null
+++ b/innobase/include/row0sel.ic
@@ -0,0 +1,91 @@
+/******************************************************
+Select
+
+(c) 1997 Innobase Oy
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+
+/*************************************************************************
+Gets the plan node for the nth table in a join. */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+ /* out: plan node */
+ sel_node_t* node, /* in: select node */
+ ulint i) /* in: get ith plan node */
+{
+ ut_ad(i < node->n_tables);
+
+ return(node->plans + i);
+}
+
+/*************************************************************************
+Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means
+that it will start fetching from the start of the result set again, regardless
+of where it was before, and it will set intention locks on the tables. */
+UNIV_INLINE
+void
+sel_node_reset_cursor(
+/*==================*/
+ sel_node_t* node) /* in: select node */
+{
+ node->state = SEL_NODE_OPEN;
+}
+
+/**************************************************************************
+Performs an execution step of an open or close cursor statement node. */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ sel_node_t* sel_node;
+ open_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_OPEN);
+
+ sel_node = node->cursor_def;
+
+ err = DB_SUCCESS;
+
+ if (node->op_type == ROW_SEL_OPEN_CURSOR) {
+
+/* if (sel_node->state == SEL_NODE_CLOSED) { */
+
+ sel_node_reset_cursor(sel_node);
+/* } else {
+ err = DB_ERROR;
+ } */
+ } else {
+ if (sel_node->state != SEL_NODE_CLOSED) {
+
+ sel_node->state = SEL_NODE_CLOSED;
+ } else {
+ err = DB_ERROR;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ /* SQL error detected */
+ printf("SQL error %lu\n", err);
+
+ ut_error;
+ que_thr_handle_error(thr, err, NULL, 0);
+
+ return(NULL);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
diff --git a/innobase/include/row0types.h b/innobase/include/row0types.h
new file mode 100644
index 00000000000..79b864f4835
--- /dev/null
+++ b/innobase/include/row0types.h
@@ -0,0 +1,37 @@
+/******************************************************
+Row operation global types
+
+(c) 1996 Innobase Oy
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0types_h
+#define row0types_h
+
+typedef struct plan_struct plan_t;
+
+typedef struct upd_struct upd_t;
+
+typedef struct upd_field_struct upd_field_t;
+
+typedef struct upd_node_struct upd_node_t;
+
+typedef struct del_node_struct del_node_t;
+
+typedef struct ins_node_struct ins_node_t;
+
+typedef struct sel_node_struct sel_node_t;
+
+typedef struct open_node_struct open_node_t;
+
+typedef struct fetch_node_struct fetch_node_t;
+
+typedef struct row_printf_node_struct row_printf_node_t;
+typedef struct sel_buf_struct sel_buf_t;
+
+typedef struct undo_node_struct undo_node_t;
+
+typedef struct purge_node_struct purge_node_t;
+
+#endif
diff --git a/innobase/include/row0uins.h b/innobase/include/row0uins.h
new file mode 100644
index 00000000000..df5e072487e
--- /dev/null
+++ b/innobase/include/row0uins.h
@@ -0,0 +1,37 @@
+/******************************************************
+Fresh insert undo
+
+(c) 1996 Innobase Oy
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0uins_h
+#define row0uins_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***************************************************************
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert. */
+
+ulint
+row_undo_ins(
+/*=========*/
+ /* out: DB_SUCCESS */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr); /* in: query thread */
+
+
+#ifndef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#endif
diff --git a/innobase/include/row0uins.ic b/innobase/include/row0uins.ic
new file mode 100644
index 00000000000..2b3d5a10f95
--- /dev/null
+++ b/innobase/include/row0uins.ic
@@ -0,0 +1,8 @@
+/******************************************************
+Fresh insert undo
+
+(c) 1996 Innobase Oy
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
diff --git a/innobase/include/row0umod.h b/innobase/include/row0umod.h
new file mode 100644
index 00000000000..2c8e19a80ae
--- /dev/null
+++ b/innobase/include/row0umod.h
@@ -0,0 +1,35 @@
+/******************************************************
+Undo modify of a row
+
+(c) 1997 Innobase Oy
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0umod_h
+#define row0umod_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***************************************************************
+Undoes a modify operation on a row of a table. */
+
+ulint
+row_undo_mod(
+/*=========*/
+ /* out: DB_SUCCESS or error code */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr); /* in: query thread */
+
+
+#ifndef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#endif
diff --git a/innobase/include/row0umod.ic b/innobase/include/row0umod.ic
new file mode 100644
index 00000000000..fcbf4dbc1f3
--- /dev/null
+++ b/innobase/include/row0umod.ic
@@ -0,0 +1,7 @@
+/******************************************************
+Undo modify of a row
+
+(c) 1997 Innobase Oy
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/row0undo.h b/innobase/include/row0undo.h
new file mode 100644
index 00000000000..5402f1d9236
--- /dev/null
+++ b/innobase/include/row0undo.h
@@ -0,0 +1,117 @@
+/******************************************************
+Row undo
+
+(c) 1997 Innobase Oy
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0undo_h
+#define row0undo_h
+
+#include "univ.i"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/************************************************************************
+Creates a row undo node to a query graph. */
+
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ /* out, own: undo node */
+ trx_t* trx, /* in: transaction */
+ que_thr_t* parent, /* in: parent node, i.e., a thr node */
+ mem_heap_t* heap); /* in: memory heap where created */
+/***************************************************************
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case. */
+
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ /* out: TRUE if found; NOTE the node->pcur
+ must be closed by the caller, regardless of
+ the return value */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr); /* in: query thread */
+/***************************************************************
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs. */
+
+que_thr_t*
+row_undo_step(
+/*==========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+
+/* A single query thread will try to perform the undo for all successive
+versions of a clustered index record, if the transaction has modified it
+several times during the execution which is rolled back. It may happen
+that the task is transferred to another query thread, if the other thread
+is assigned to handle an undo log record in the chain of different versions
+of the record, and the other thread happens to get the x-latch to the
+clustered index record at the right time.
+ If a query thread notices that the clustered index record it is looking
+for is missing, or the roll ptr field in the record does not point to the
+undo log record the thread was assigned to handle, then it gives up the undo
+task for that undo log record, and fetches the next. This situation can occur
+only in the case where the transaction modified the same record several times
+and another thread is currently doing the undo for successive versions of
+that index record. */
+
+/* Undo node structure */
+
+struct undo_node_struct{
+ que_common_t common; /* node type: QUE_NODE_UNDO */
+ ulint state; /* node execution state */
+ trx_t* trx; /* trx for which undo is done */
+ dulint roll_ptr;/* roll pointer to undo log record */
+ trx_undo_rec_t* undo_rec;/* undo log record */
+ dulint undo_no;/* undo number of the record */
+ ulint rec_type;/* undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ dulint new_roll_ptr; /* roll ptr to restore to clustered index
+ record */
+ dulint new_trx_id; /* trx id to restore to clustered index
+ record */
+ btr_pcur_t pcur; /* persistent cursor used in searching the
+ clustered index record */
+ dict_table_t* table; /* table where undo is done; NOTE that the
+ table has to be released explicitly with
+ dict_table_release */
+ ulint cmpl_info;/* compiler analysis of an update */
+ upd_t* update; /* update vector for a clustered index record */
+ dtuple_t* ref; /* row reference to the next row to handle */
+ dtuple_t* row; /* a copy (also fields copied to heap) of the
+ row to handle */
+ dict_index_t* index; /* the next index whose record should be
+ handled */
+ mem_heap_t* heap; /* memory heap used as auxiliary storage for
+ row; this must be emptied after undo is tried
+ on a row */
+};
+
+/* Execution states for an undo node */
+#define UNDO_NODE_FETCH_NEXT 1 /* we should fetch the next undo log
+ record */
+#define UNDO_NODE_PREV_VERS 2 /* the roll ptr to previous version of
+ a row is stored in node, and undo
+ should be done based on it */
+#define UNDO_NODE_INSERT 3
+#define UNDO_NODE_MODIFY 4
+
+
+#ifndef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#endif
diff --git a/innobase/include/row0undo.ic b/innobase/include/row0undo.ic
new file mode 100644
index 00000000000..e7f89c7de67
--- /dev/null
+++ b/innobase/include/row0undo.ic
@@ -0,0 +1,7 @@
+/******************************************************
+Row undo
+
+(c) 1997 Innobase Oy
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/row0upd.h b/innobase/include/row0upd.h
new file mode 100644
index 00000000000..3046345f446
--- /dev/null
+++ b/innobase/include/row0upd.h
@@ -0,0 +1,363 @@
+/******************************************************
+Update of a row
+
+(c) 1996 Innobase Oy
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0upd_h
+#define row0upd_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "pars0types.h"
+
+/*************************************************************************
+Creates an update vector object. */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ /* out, own: update vector object */
+ ulint n, /* in: number of fields */
+ mem_heap_t* heap); /* in: heap from which memory allocated */
+/*************************************************************************
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector. */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ /* out: number of fields */
+ upd_t* update); /* in: update vector */
+/*************************************************************************
+Returns the nth field of an update vector. */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ /* out: update vector field */
+ upd_t* update, /* in: update vector */
+ ulint n); /* in: field position in update vector */
+/*************************************************************************
+Sets the clustered index field number to be updated by an update vector
+field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /* in: update vector field */
+ ulint field_no, /* in: field number in a clustered
+ index */
+ dict_index_t* index); /* in: clustered index */
+/*************************************************************************
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record. */
+
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+ /* out: new pointer to mlog */
+ dict_index_t* index, /* in: clustered index */
+ trx_t* trx, /* in: transaction */
+ dulint roll_ptr,/* in: roll ptr of the undo log record */
+ byte* log_ptr,/* pointer to a buffer of size > 20 opened
+ in mlog */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************************
+Updates the trx id and roll ptr field in a clustered index record when
+a row is updated or marked deleted. */
+UNIV_INLINE
+void
+row_upd_rec_sys_fields(
+/*===================*/
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ trx_t* trx, /* in: transaction */
+ dulint roll_ptr);/* in: roll ptr of the undo log record */
+/*************************************************************************
+Sets the trx id or roll ptr field of a clustered index entry. */
+
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+ dtuple_t* entry, /* in: index entry, where the memory buffers
+ for sys fields are already allocated:
+ the function just copies the new values to
+ them */
+ dict_index_t* index, /* in: clustered index */
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ dulint val); /* in: value to write */
+/*************************************************************************
+Creates an update node for a query graph. */
+
+upd_node_t*
+upd_node_create(
+/*============*/
+ /* out, own: update node */
+ mem_heap_t* heap); /* in: mem heap where created */
+/***************************************************************
+Writes to the redo log the new values of the fields occurring in the index. */
+
+void
+row_upd_index_write_log(
+/*====================*/
+ upd_t* update, /* in: update vector */
+ byte* log_ptr,/* in: pointer to mlog buffer: must contain at least
+ MLOG_BUF_MARGIN bytes of free space; the buffer is
+ closed within this function */
+ mtr_t* mtr); /* in: mtr into whose log to write */
+/***************************************************************
+Returns TRUE if row update changes size of some field in index. */
+
+ibool
+row_upd_changes_field_size(
+/*=======================*/
+ /* out: TRUE if the update changes the size of
+ some field in index */
+ rec_t* rec, /* in: record in clustered index */
+ dict_index_t* index, /* in: clustered index */
+ upd_t* update);/* in: update vector */
+/***************************************************************
+Replaces the new column values stored in the update vector to the record
+given. No field size changes are allowed. This function is used only for
+a clustered index. */
+
+void
+row_upd_rec_in_place(
+/*=================*/
+ rec_t* rec, /* in/out: record where replaced */
+ upd_t* update);/* in: update vector */
+/*******************************************************************
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. */
+
+upd_t*
+row_upd_build_difference(
+/*=====================*/
+ /* out, own: update vector of differing
+ fields, excluding roll ptr and trx id */
+ dict_index_t* index, /* in: clustered index */
+ dtuple_t* entry, /* in: entry to insert */
+ rec_t* rec, /* in: clustered index record */
+ mem_heap_t* heap); /* in: memory heap from which allocated */
+/***************************************************************
+Replaces the new column values stored in the update vector to the index entry
+given. */
+
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+ dtuple_t* entry, /* in/out: index entry where replaced */
+ dict_index_t* index, /* in: index; NOTE that may also be a
+ non-clustered index */
+ upd_t* update); /* in: update vector */
+/***************************************************************
+Replaces the new column values stored in the update vector to the
+clustered index entry given. */
+
+void
+row_upd_clust_index_replace_new_col_vals(
+/*=====================================*/
+ dtuple_t* entry, /* in/out: index entry where replaced */
+ upd_t* update); /* in: update vector */
+/***************************************************************
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic. */
+
+ibool
+row_upd_changes_ord_field(
+/*======================*/
+ /* out: TRUE if update vector changes
+ an ordering field in the index record */
+ dtuple_t* row, /* in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ dict_index_t* index, /* in: index of the record */
+ upd_t* update);/* in: update vector for the row */
+/***************************************************************
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic. */
+
+ibool
+row_upd_changes_some_index_ord_field(
+/*=================================*/
+ /* out: TRUE if update vector may change
+ an ordering field in an index record */
+ dict_table_t* table, /* in: table */
+ upd_t* update);/* in: update vector for the row */
+/***************************************************************
+Updates a row in a table. This is a high-level function used
+in SQL execution graphs. */
+
+que_thr_t*
+row_upd_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/*************************************************************************
+Performs an in-place update for the current clustered index record in
+select. */
+
+void
+row_upd_in_place_in_select(
+/*=======================*/
+ sel_node_t* sel_node, /* in: select node */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************************
+Parses the log data of system field values. */
+
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+ /* out: log data end or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ ulint* pos, /* out: TRX_ID position in record */
+ dulint* trx_id, /* out: trx id */
+ dulint* roll_ptr);/* out: roll ptr */
+/*************************************************************************
+Updates the trx id and roll ptr field in a clustered index record in database
+recovery. */
+
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+ rec_t* rec, /* in: record */
+ ulint pos, /* in: TRX_ID position in rec */
+ dulint trx_id, /* in: transaction id */
+ dulint roll_ptr);/* in: roll ptr of the undo log record */
+/*************************************************************************
+Parses the log data written by row_upd_index_write_log. */
+
+byte*
+row_upd_index_parse(
+/*================*/
+ /* out: log data end or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ mem_heap_t* heap, /* in: memory heap where update vector is
+ built */
+ upd_t** update_out);/* out: update vector */
+
+
+/* Update vector field */
+struct upd_field_struct{
+ ulint field_no; /* field number in the clustered
+ index */
+ que_node_t* exp; /* expression for calculating a new
+ value: it refers to column values and
+ constants in the symbol table of the
+ query graph */
+ dfield_t new_val; /* new value for the column */
+};
+
+/* Update vector structure */
+struct upd_struct{
+ ulint info_bits; /* new value of info bits to record;
+ default is 0 */
+ ulint n_fields; /* number of update fields */
+ upd_field_t* fields; /* array of update fields */
+};
+
+/* Update node structure which also implements the delete operation
+of a row */
+
+struct upd_node_struct{
+ que_common_t common; /* node type: QUE_NODE_UPDATE */
+ ibool is_delete;/* TRUE if delete, FALSE if update */
+ ibool searched_update;
+ /* TRUE if searched update, FALSE if
+ positioned */
+ ibool select_will_do_update;
+ /* TRUE if a searched update where ordering
+ fields will not be updated, and the size of
+ the fields will not change: in this case the
+ select node will take care of the update */
+ ibool in_mysql_interface;
+ /* TRUE if the update node was created
+ for the MySQL interface */
+ sel_node_t* select; /* query graph subtree implementing a base
+ table cursor: the rows returned will be
+ updated */
+ btr_pcur_t* pcur; /* persistent cursor placed on the clustered
+ index record which should be updated or
+ deleted; the cursor is stored in the graph
+ of 'select' field above, except in the case
+ of the MySQL interface */
+ dict_table_t* table; /* table where updated */
+ upd_t* update; /* update vector for the row */
+ sym_node_list_t columns;/* symbol table nodes for the columns
+ to retrieve from the table */
+ ibool has_clust_rec_x_lock;
+ /* TRUE if the select which retrieves the
+ records to update already sets an x-lock on
+ the clustered record; note that it must always
+ set at least an s-lock */
+ ulint cmpl_info;/* information extracted during query
+ compilation; speeds up execution:
+ UPD_NODE_NO_ORD_CHANGE and
+ UPD_NODE_NO_SIZE_CHANGE, ORed */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /* node execution state */
+ dict_index_t* index; /* NULL, or the next index whose record should
+ be updated */
+ dtuple_t* row; /* NULL, or a copy (also fields copied to
+ heap) of the row to update; this must be reset
+ to NULL after a successful update */
+ mem_heap_t* heap; /* memory heap used as auxiliary storage for
+ row; this must be emptied after a successful
+ update if node->row != NULL */
+ /*----------------------*/
+ sym_node_t* table_sym;/* table node in symbol table */
+ que_node_t* col_assign_list;
+ /* column assignment list */
+ ulint magic_n;
+};
+
+#define UPD_NODE_MAGIC_N 1579975
+
+/* Node execution states */
+#define UPD_NODE_SET_IX_LOCK 1 /* execution came to the node from
+ a node above and if the field
+ has_clust_rec_x_lock is FALSE, we
+ should set an intention x-lock on
+ the table */
+#define UPD_NODE_UPDATE_CLUSTERED 2 /* clustered index record should be
+ updated */
+#define UPD_NODE_INSERT_CLUSTERED 3 /* clustered index record should be
+ inserted, old record is already delete
+ marked */
+#define UPD_NODE_UPDATE_ALL_SEC 4 /* an ordering field of the clustered
+ index record was changed, or this is
+ a delete operation: should update
+ all the secondary index records */
+#define UPD_NODE_UPDATE_SOME_SEC 5 /* secondary index entries should be
+ looked at and updated if an ordering
+ field changed */
+
+/* Compilation info flags: these must fit within one byte */
+#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be
+ changed in the update and no ordering
+ field of the clustered index */
+#define UPD_NODE_NO_SIZE_CHANGE 2 /* no record field size will be
+ changed in the update */
+
+#ifndef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#endif
diff --git a/innobase/include/row0upd.ic b/innobase/include/row0upd.ic
new file mode 100644
index 00000000000..b1b10bef0e8
--- /dev/null
+++ b/innobase/include/row0upd.ic
@@ -0,0 +1,105 @@
+/******************************************************
+Update of a row
+
+(c) 1996 Innobase Oy
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "row0row.h"
+#include "btr0sea.h"
+
+/*************************************************************************
+Creates an update vector object. */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ /* out, own: update vector object */
+ ulint n, /* in: number of fields */
+ mem_heap_t* heap) /* in: heap from which memory allocated */
+{
+ upd_t* update;
+
+ update = mem_heap_alloc(heap, sizeof(upd_t));
+
+ update->info_bits = 0;
+ update->n_fields = n;
+ update->fields = mem_heap_alloc(heap, sizeof(upd_field_t) * n);
+
+ return(update);
+}
+
+/*************************************************************************
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector. */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ /* out: number of fields */
+ upd_t* update) /* in: update vector */
+{
+ ut_ad(update);
+
+ return(update->n_fields);
+}
+
+/*************************************************************************
+Returns the nth field of an update vector. */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ /* out: update vector field */
+ upd_t* update, /* in: update vector */
+ ulint n) /* in: field position in update vector */
+{
+ ut_ad(update);
+ ut_ad(n < update->n_fields);
+
+ return(update->fields + n);
+}
+
+/*************************************************************************
+Sets the clustered index field number to be updated by an update vector
+field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /* in: update vector field */
+ ulint field_no, /* in: field number in a clustered
+ index */
+ dict_index_t* index) /* in: clustered index */
+{
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ upd_field->field_no = field_no;
+
+ dtype_copy(dfield_get_type(&(upd_field->new_val)),
+ dict_index_get_nth_type(index, field_no));
+}
+
+/*************************************************************************
+Updates the trx id and roll ptr field in a clustered index record when
+a row is updated or marked deleted. */
+UNIV_INLINE
+void
+row_upd_rec_sys_fields(
+/*===================*/
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ trx_t* trx, /* in: transaction */
+ dulint roll_ptr)/* in: roll ptr of the undo log record */
+{
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(!buf_block_align(rec)->is_hashed
+ || rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+
+ row_set_rec_trx_id(rec, index, trx->id);
+ row_set_rec_roll_ptr(rec, index, roll_ptr);
+}
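A hedged sketch of how the update-vector helpers declared in row0upd.h and defined above fit together; the heap, clustered index and new value are assumed to come from the caller, and the field number 2 as well as the helper name are purely illustrative.

#include "row0upd.h"

/* Hedged sketch only: build a one-field update vector which assigns a new
value to clustered index field number 2. */
static upd_t*
example_build_update(
	mem_heap_t*	heap,		/* in: memory heap */
	dict_index_t*	clust_index,	/* in: clustered index */
	byte*		new_data,	/* in: new column value */
	ulint		new_len)	/* in: length of the new value */
{
	upd_t*		update;
	upd_field_t*	ufield;

	update = upd_create(1, heap);

	ufield = upd_get_nth_field(update, 0);
	upd_field_set_field_no(ufield, 2, clust_index);
	dfield_set_data(&(ufield->new_val), new_data, new_len);

	return(update);
}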
diff --git a/innobase/include/row0vers.h b/innobase/include/row0vers.h
new file mode 100644
index 00000000000..30cf82144e9
--- /dev/null
+++ b/innobase/include/row0vers.h
@@ -0,0 +1,95 @@
+/******************************************************
+Row versions
+
+(c) 1997 Innobase Oy
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0vers_h
+#define row0vers_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "rem0types.h"
+#include "mtr0mtr.h"
+#include "read0types.h"
+
+/*********************************************************************
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function! */
+
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+ /* out: NULL if committed, else the active
+ transaction; NOTE that the kernel mutex is
+ temporarily released! */
+ rec_t* rec, /* in: record in a secondary index */
+ dict_index_t* index); /* in: the secondary index */
+/*********************************************************************
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view. */
+
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ /* out: TRUE if earlier version should be preserved */
+ dulint trx_id, /* in: transaction id in the version */
+ mtr_t* mtr); /* in: mtr holding the latch on the clustered index
+ record; it will also hold the latch on purge_view */
+/*********************************************************************
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE. */
+
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+ /* out: TRUE if earlier version should have */
+ ibool also_curr,/* in: TRUE if also rec is included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ rec_t* rec, /* in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /* in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /* in: the secondary index */
+ dtuple_t* ientry); /* in: the secondary index entry */
+/*********************************************************************
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version. */
+
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+ /* out: DB_SUCCESS or DB_MISSING_HISTORY */
+ rec_t* rec, /* in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+				of this record */
+ mtr_t* mtr, /* in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /* in: the clustered index */
+ read_view_t* view, /* in: the consistent read view */
+ mem_heap_t* in_heap,/* in: memory heap from which the memory for
+ old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers);/* out, own: old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+
+
+#ifndef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#endif
diff --git a/innobase/include/row0vers.ic b/innobase/include/row0vers.ic
new file mode 100644
index 00000000000..aa7a7aa2299
--- /dev/null
+++ b/innobase/include/row0vers.ic
@@ -0,0 +1,83 @@
+/******************************************************
+Row versions
+
+(c) 1997 Innobase Oy
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+#include "dict0dict.h"
+#include "read0read.h"
+#include "page0page.h"
+#include "log0recv.h"
+
+/*************************************************************************
+Fetches the trx id of a clustered index record or version. */
+UNIV_INLINE
+dulint
+row_vers_get_trx_id(
+/*================*/
+ /* out: trx id or ut_dulint_zero if the
+ clustered index record not found */
+ rec_t* rec, /* in: clustered index record, or an old
+ version of it */
+ dict_table_t* table) /* in: table */
+{
+ return(row_get_rec_trx_id(rec, dict_table_get_first_index(table)));
+}
+
+/*************************************************************************
+Checks if a consistent read can be performed immediately on the index
+record, or if an older version is needed. */
+UNIV_INLINE
+ibool
+row_vers_clust_rec_sees_older(
+/*==========================*/
+ /* out: FALSE if can read immediately */
+ rec_t* rec, /* in: record which should be read or passed
+ over by a read cursor */
+ dict_index_t* index, /* in: clustered index */
+ read_view_t* view) /* in: read view */
+{
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ if (read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index))) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Checks if a secondary index record can be read immediately by a consistent
+read, or if an older version may be needed. To be sure, we will have to
+look in the clustered index. */
+UNIV_INLINE
+ibool
+row_vers_sec_rec_may_see_older(
+/*===========================*/
+ /* out: FALSE if can be read immediately */
+ rec_t* rec, /* in: record which should be read or passed */
+ dict_index_t* index, /* in: secondary index */
+ read_view_t* view) /* in: read view */
+{
+ page_t* page;
+
+ ut_ad(!(index->type & DICT_CLUSTERED));
+
+ page = buf_frame_align(rec);
+
+ if ((ut_dulint_cmp(page_get_max_trx_id(page), view->up_limit_id) >= 0)
+ || recv_recovery_is_on()) {
+
+ /* It may be that the record was inserted or modified by a
+ transaction the view should not see: we have to look in the
+ clustered index */
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
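To show how the version functions above are intended to be combined for a consistent read, here is a hedged sketch; the read view, heap, mtr and page latch are assumed to be set up by the caller, and the helper name is illustrative.

#include "row0vers.h"

/* Hedged sketch only: return in *version the version of a clustered index
record which the given consistent read view should see, rebuilding an old
version from the undo log only when the current record is too new. */
static ulint
example_read_version(
	rec_t*		rec,		/* in: clustered index record,
					page latched by the caller */
	dict_index_t*	clust_index,	/* in: clustered index */
	read_view_t*	view,		/* in: consistent read view */
	mem_heap_t*	heap,		/* in: heap for the old version */
	mtr_t*		mtr,		/* in: mtr holding the latch on rec */
	rec_t**		version)	/* out: version to be seen, or NULL */
{
	if (!row_vers_clust_rec_sees_older(rec, clust_index, view)) {
		/* The record in its present version is visible */
		*version = rec;

		return(DB_SUCCESS);
	}

	return(row_vers_build_for_consistent_read(rec, mtr, clust_index,
						view, heap, version));
}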
diff --git a/innobase/include/srv0que.h b/innobase/include/srv0que.h
new file mode 100644
index 00000000000..05c339cdd32
--- /dev/null
+++ b/innobase/include/srv0que.h
@@ -0,0 +1,53 @@
+/******************************************************
+Server query execution
+
+(c) 1996 Innobase Oy
+
+Created 6/5/1996 Heikki Tuuri
+*******************************************************/
+
+
+#ifndef srv0que_h
+#define srv0que_h
+
+#include "univ.i"
+#include "que0types.h"
+
+/**************************************************************************
+Checks if there is work to do in the server task queue. If there is, the
+thread starts processing a task. Before leaving, it again checks the task
+queue and picks a new task if any exists. This is called by a SRV_WORKER
+thread. */
+
+void
+srv_que_task_queue_check(void);
+/*==========================*/
+/**************************************************************************
+Performs round-robin on the server tasks. This is called by a SRV_WORKER
+thread every second or so. */
+
+que_thr_t*
+srv_que_round_robin(
+/*================*/
+ /* out: the new (may be == thr) query thread
+ to run */
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Enqueues a task to server task queue and releases a worker thread, if
+there exists one suspended. */
+
+void
+srv_que_task_enqueue(
+/*=================*/
+ que_thr_t* thr); /* in: query thread */
+/**************************************************************************
+Enqueues a task to server task queue and releases a worker thread, if
+there exists one suspended. */
+
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+ que_thr_t* thr); /* in: query thread */
+
+#endif
+
diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
new file mode 100644
index 00000000000..6418b903eeb
--- /dev/null
+++ b/innobase/include/srv0srv.h
@@ -0,0 +1,237 @@
+/******************************************************
+The server main program
+
+(c) 1995 Innobase Oy
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+
+#ifndef srv0srv_h
+#define srv0srv_h
+
+#include "univ.i"
+#include "sync0sync.h"
+#include "os0sync.h"
+#include "com0com.h"
+#include "que0types.h"
+
+/* Server parameters which are read from the initfile */
+
+extern char* srv_data_home;
+extern char* srv_logs_home;
+extern char* srv_arch_dir;
+
+extern ulint srv_n_data_files;
+extern char** srv_data_file_names;
+extern ulint* srv_data_file_sizes;
+
+extern char** srv_log_group_home_dirs;
+
+extern ulint srv_n_log_groups;
+extern ulint srv_n_log_files;
+extern ulint srv_log_file_size;
+extern ibool srv_log_archive_on;
+extern ulint srv_log_buffer_size;
+extern ibool srv_flush_log_at_trx_commit;
+
+extern ibool srv_use_native_aio;
+
+extern ulint srv_pool_size;
+extern ulint srv_mem_pool_size;
+extern ulint srv_lock_table_size;
+
+extern ulint srv_n_file_io_threads;
+
+extern ibool srv_archive_recovery;
+extern dulint srv_archive_recovery_limit_lsn;
+
+extern ulint srv_lock_wait_timeout;
+
+/*-------------------------------------------*/
+extern ulint srv_n_spin_wait_rounds;
+extern ulint srv_spin_wait_delay;
+extern ibool srv_priority_boost;
+
+extern ulint srv_pool_size;
+extern ulint srv_mem_pool_size;
+extern ulint srv_lock_table_size;
+
+extern ulint srv_sim_disk_wait_pct;
+extern ulint srv_sim_disk_wait_len;
+extern ibool srv_sim_disk_wait_by_yield;
+extern ibool srv_sim_disk_wait_by_wait;
+
+extern ibool srv_measure_contention;
+extern ibool srv_measure_by_spin;
+
+extern ibool srv_print_thread_releases;
+extern ibool srv_print_lock_waits;
+extern ibool srv_print_buf_io;
+extern ibool srv_print_log_io;
+extern ibool srv_print_parsed_sql;
+extern ibool srv_print_latch_waits;
+
+extern ibool srv_test_nocache;
+extern ibool srv_test_cache_evict;
+
+extern ibool srv_test_extra_mutexes;
+extern ibool srv_test_sync;
+extern ulint srv_test_n_threads;
+extern ulint srv_test_n_loops;
+extern ulint srv_test_n_free_rnds;
+extern ulint srv_test_n_reserved_rnds;
+extern ulint srv_test_n_mutexes;
+extern ulint srv_test_array_size;
+
+extern ulint srv_activity_count;
+
+extern mutex_t* kernel_mutex_temp;/* mutex protecting the server, trx structs,
+ query threads, and lock table: we allocate
+ it from dynamic memory to get it to the
+ same DRAM page as other hotspot semaphores */
+#define kernel_mutex (*kernel_mutex_temp)
+
+typedef struct srv_sys_struct srv_sys_t;
+
+/* The server system */
+extern srv_sys_t* srv_sys;
+
+/*************************************************************************
+Boots Innobase server. */
+
+ulint
+srv_boot(void);
+/*==========*/
+ /* out: DB_SUCCESS or error code */
+/*************************************************************************
+Gets the number of threads in the system. */
+
+ulint
+srv_get_n_threads(void);
+/*===================*/
+/*************************************************************************
+Returns the calling thread type. */
+
+ulint
+srv_get_thread_type(void);
+/*=====================*/
+ /* out: SRV_COM, ... */
+/*************************************************************************
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller! */
+
+ulint
+srv_release_threads(
+/*================*/
+ /* out: number of threads released: this may be
+ < n if not enough threads were suspended at the
+ moment */
+ ulint type, /* in: thread type */
+ ulint n); /* in: number of threads to release */
+/*************************************************************************
+The master thread controlling the server. */
+
+ulint
+srv_master_thread(
+/*==============*/
+ /* out: a dummy parameter */
+ void* arg); /* in: a dummy parameter required by
+ os_thread_create */
+/*************************************************************************
+Reads a keyword and a value from a file. */
+
+ulint
+srv_read_init_val(
+/*==============*/
+ /* out: DB_SUCCESS or error code */
+ FILE* initfile, /* in: file pointer */
+ char* keyword, /* in: keyword before value(s), or NULL if
+ no keyword read */
+ char* str_buf, /* in/out: buffer for a string value to read,
+ buffer size must be 10000 bytes, if NULL
+ then not read */
+ ulint* num_val, /* out: numerical value to read, if NULL
+ then not read */
+ ibool print_not_err); /* in: if TRUE, then we will not print
+ error messages to console */
+/***********************************************************************
+Tells the Innobase server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the kernel
+mutex, for performance reasons). */
+
+void
+srv_active_wake_master_thread(void);
+/*===============================*/
+/*******************************************************************
+Puts a MySQL OS thread to wait for a lock to be released. */
+
+ibool
+srv_suspend_mysql_thread(
+/*=====================*/
+ /* out: TRUE if the lock wait timeout was
+ exceeded */
+ que_thr_t* thr); /* in: query thread associated with
+ the MySQL OS thread */
+/************************************************************************
+Releases a MySQL OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+
+void
+srv_release_mysql_thread_if_suspended(
+/*==================================*/
+ que_thr_t* thr); /* in: query thread associated with the
+ MySQL OS thread */
+/*************************************************************************
+A thread which wakes up threads whose lock wait may have lasted too long. */
+
+ulint
+srv_lock_timeout_monitor_thread(
+/*============================*/
+ /* out: a dummy parameter */
+ void* arg); /* in: a dummy parameter required by
+ os_thread_create */
+
+
+/* Types for the threads existing in the system. Threads of types 4 - 9
+are called utility threads. Note that utility threads are mainly disk
+bound, except that version threads 6 - 7 may also be CPU bound, if
+cleaning versions from the buffer pool. */
+
+#define SRV_COM 1 /* threads serving communication and queries */
+#define SRV_CONSOLE 2 /* thread serving console */
+#define SRV_WORKER 3 /* threads serving parallelized queries and
+ queries released from lock wait */
+#define SRV_BUFFER 4 /* thread flushing dirty buffer blocks,
+ not currently in use */
+#define SRV_RECOVERY 5 /* threads finishing a recovery,
+ not currently in use */
+#define SRV_INSERT 6 /* thread flushing the insert buffer to disk,
+ not currently in use */
+#define SRV_MASTER	7	/* the master thread (whose type number must
+ be biggest) */
+
+/* Thread slot in the thread table */
+typedef struct srv_slot_struct srv_slot_t;
+
+/* Thread table is an array of slots */
+typedef srv_slot_t srv_table_t;
+
+/* The server system struct */
+struct srv_sys_struct{
+ os_event_t operational; /* created threads must wait for the
+ server to become operational by
+ waiting for this event */
+ com_endpoint_t* endpoint; /* the communication endpoint of the
+ server */
+
+ srv_table_t* threads; /* server thread table */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ tasks; /* task queue */
+};
+
+extern ulint srv_n_threads_active[];
+
+#endif
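
The wake-up hooks declared above are what the MySQL interface is expected to call around ordinary query activity. A minimal sketch, under the assumption that DB_SUCCESS is the success code declared in db0err.h and that real error handling is done by the caller:

	ulint	err;

	err = srv_boot();		/* boot the Innobase server subsystem */

	if (err != DB_SUCCESS) {
		/* handle the error; the possible codes are listed in db0err.h */
	}

	/* ... a user operation modifies the database ... */

	srv_active_wake_master_thread();	/* nudge the master thread */
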
diff --git a/innobase/include/srv0srv.ic b/innobase/include/srv0srv.ic
new file mode 100644
index 00000000000..73e0729660f
--- /dev/null
+++ b/innobase/include/srv0srv.ic
@@ -0,0 +1,7 @@
+/******************************************************
+Server main program
+
+(c) 1995 Innobase Oy
+
+Created 10/4/1995 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/srv0start.h b/innobase/include/srv0start.h
new file mode 100644
index 00000000000..66eeb4f2e3c
--- /dev/null
+++ b/innobase/include/srv0start.h
@@ -0,0 +1,31 @@
+/******************************************************
+Starts the Innobase database server
+
+(c) 1995-2000 Innobase Oy
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+
+#ifndef srv0start_h
+#define srv0start_h
+
+#include "univ.i"
+
+/********************************************************************
+Starts Innobase and creates a new database if database files
+are not found and the user wants them created. Server parameters are
+read from a file named "srv_init" in the ib_home directory. */
+
+int
+innobase_start_or_create_for_mysql(void);
+/*====================================*/
+ /* out: DB_SUCCESS or error code */
+/********************************************************************
+Shuts down the Innobase database. */
+
+int
+innobase_shutdown_for_mysql(void);
+/*=============================*/
+ /* out: DB_SUCCESS or error code */
+#endif
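
A sketch of the intended call sequence from the MySQL side, based only on the two declarations above; the surrounding handler glue is omitted:

	int	err;

	err = innobase_start_or_create_for_mysql();

	if (err != DB_SUCCESS) {
		return(1);		/* startup failed */
	}

	/* ... the server runs queries ... */

	err = innobase_shutdown_for_mysql();
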
diff --git a/innobase/include/sync0arr.h b/innobase/include/sync0arr.h
new file mode 100644
index 00000000000..75d79f4c93f
--- /dev/null
+++ b/innobase/include/sync0arr.h
@@ -0,0 +1,114 @@
+/******************************************************
+The wait array used in synchronization primitives
+
+(c) 1995 Innobase Oy
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0arr_h
+#define sync0arr_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+
+typedef struct sync_cell_struct sync_cell_t;
+typedef struct sync_array_struct sync_array_t;
+
+#define SYNC_ARRAY_OS_MUTEX 1
+#define SYNC_ARRAY_MUTEX 2
+
+/***********************************************************************
+Creates a synchronization wait array. It is protected by a mutex
+which is automatically reserved when the functions operating on it
+are called. */
+
+sync_array_t*
+sync_array_create(
+/*==============*/
+ /* out, own: created wait array */
+ ulint n_cells, /* in: number of cells in the array
+ to create */
+ ulint protection); /* in: either SYNC_ARRAY_OS_MUTEX or
+ SYNC_ARRAY_MUTEX: determines the type
+ of mutex protecting the data structure */
+/**********************************************************************
+Frees the resources in a wait array. */
+
+void
+sync_array_free(
+/*============*/
+ sync_array_t* arr); /* in, own: sync wait array */
+/**********************************************************************
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state. */
+
+void
+sync_array_reserve_cell(
+/*====================*/
+ sync_array_t* arr, /* in: wait array */
+ void* object, /* in: pointer to the object to wait for */
+ ulint type, /* in: lock request type */
+ #ifdef UNIV_SYNC_DEBUG
+ char* file, /* in: in debug version file where
+ requested */
+ ulint line, /* in: in the debug version line where
+ requested */
+ #endif
+ ulint* index); /* out: index of the reserved cell */
+/**********************************************************************
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case it prints info and asserts. */
+
+void
+sync_array_wait_event(
+/*==================*/
+ sync_array_t* arr, /* in: wait array */
+ ulint index); /* in: index of the reserved cell */
+/**********************************************************************
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+
+void
+sync_array_free_cell(
+/*=================*/
+ sync_array_t* arr, /* in: wait array */
+ ulint index); /* in: index of the cell in array */
+/**************************************************************************
+Looks for the cells in the wait array which refer to the wait object
+specified, and sets their corresponding events to the signaled state. In
+this way the threads waiting for the object are released to contend for
+the object. It is possible that no such cell is found, in which case
+nothing is done. */
+
+void
+sync_array_signal_object(
+/*=====================*/
+ sync_array_t* arr, /* in: wait array */
+ void* object);/* in: wait object */
+/************************************************************************
+Validates the integrity of the wait array. Checks
+that the number of reserved cells equals the count variable. */
+
+void
+sync_array_validate(
+/*================*/
+ sync_array_t* arr); /* in: sync wait array */
+/**************************************************************************
+Prints info of the wait array. */
+
+void
+sync_array_print_info(
+/*==================*/
+ sync_array_t* arr); /* in: wait array */
+
+
+#ifndef UNIV_NONINL
+#include "sync0arr.ic"
+#endif
+
+#endif
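
Together these functions form a reserve/wait/signal cycle. The sketch below shows one plausible use, assuming UNIV_SYNC_DEBUG is not defined (so the non-debug signature of sync_array_reserve_cell applies), that lock is a pointer to the object being waited for, and that RW_LOCK_EX stands in for the request type; sync_primary_wait_array is the global wait array referred to elsewhere in the sync code:

	ulint	index;

	/* Waiting side: reserve a cell and suspend on its event;
	sync_array_wait_event frees the cell when the wait ends */

	sync_array_reserve_cell(sync_primary_wait_array, lock,
						RW_LOCK_EX, &index);
	sync_array_wait_event(sync_primary_wait_array, index);

	/* Releasing side: wake up every thread waiting for the object */

	sync_array_signal_object(sync_primary_wait_array, lock);
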
diff --git a/innobase/include/sync0arr.ic b/innobase/include/sync0arr.ic
new file mode 100644
index 00000000000..dbe35c033e5
--- /dev/null
+++ b/innobase/include/sync0arr.ic
@@ -0,0 +1,10 @@
+/******************************************************
+The wait array for synchronization primitives
+
+Inline code
+
+(c) 1995 Innobase Oy
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
diff --git a/innobase/include/sync0ipm.h b/innobase/include/sync0ipm.h
new file mode 100644
index 00000000000..3244a6d26de
--- /dev/null
+++ b/innobase/include/sync0ipm.h
@@ -0,0 +1,113 @@
+/******************************************************
+A fast mutex for interprocess synchronization.
+mutex_t can be used only within a single process,
+but an ip mutex can also be used between processes.
+
+(c) 1995 Innobase Oy
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0ipm_h
+#define sync0ipm_h
+
+#include "univ.i"
+#include "os0sync.h"
+#include "sync0sync.h"
+
+typedef struct ip_mutex_hdl_struct ip_mutex_hdl_t;
+typedef struct ip_mutex_struct ip_mutex_t;
+
+/* NOTE! The structure appears here only for the compiler to
+know its size. Do not use its fields directly!
+The structure used in a fast implementation of
+an interprocess mutex. */
+
+struct ip_mutex_struct {
+ mutex_t mutex; /* Ordinary mutex struct */
+ ulint waiters; /* This field is set to 1 if
+ there may be waiters */
+};
+
+/* The performance of the ip mutex in NT depends on how often
+a thread has to suspend itself waiting for the ip mutex
+to become free. The following variable counts system calls
+involved. */
+
+extern ulint ip_mutex_system_call_count;
+
+/**********************************************************************
+Creates, or rather, initializes
+an ip mutex object in a specified shared memory location (which must be
+appropriately aligned). The ip mutex is initialized in the reset state.
+NOTE! Explicit destroying of the ip mutex with ip_mutex_free
+is not recommended
+as the mutex resides in shared memory and we cannot make sure that
+no process is currently accessing it. Therefore just use
+ip_mutex_close to free the operating system event and mutex. */
+
+ulint
+ip_mutex_create(
+/*============*/
+ /* out: 0 if succeed */
+ ip_mutex_t* ip_mutex, /* in: pointer to shared memory */
+ char* name, /* in: name of the ip mutex */
+ ip_mutex_hdl_t** handle); /* out, own: handle to the
+ created mutex; handle exists
+ in the private address space of
+ the calling process */
+/**********************************************************************
+NOTE! Using this function is not recommended. See the note
+on ip_mutex_create. Destroys an ip mutex */
+
+void
+ip_mutex_free(
+/*==========*/
+ ip_mutex_hdl_t* handle); /* in, own: ip mutex handle */
+/**********************************************************************
+Opens an ip mutex object in a specified shared memory location.
+Explicit closing of the ip mutex with ip_mutex_close is necessary to
+free the operating system event and mutex created, and the handle. */
+
+ulint
+ip_mutex_open(
+/*==========*/
+ /* out: 0 if succeed */
+ ip_mutex_t* ip_mutex, /* in: pointer to shared memory */
+ char* name, /* in: name of the ip mutex */
+ ip_mutex_hdl_t** handle); /* out, own: handle to the
+ opened mutex */
+/**********************************************************************
+Closes an ip mutex. */
+
+void
+ip_mutex_close(
+/*===========*/
+ ip_mutex_hdl_t* handle); /* in, own: ip mutex handle */
+/******************************************************************
+Reserves an ip mutex. */
+UNIV_INLINE
+ulint
+ip_mutex_enter(
+/*===========*/
+ /* out: 0 if success,
+ SYNC_TIME_EXCEEDED if timeout */
+ ip_mutex_hdl_t* ip_mutex_hdl, /* in: pointer to ip mutex handle */
+ ulint time); /* in: maximum time to wait, in
+ microseconds, or
+ SYNC_INFINITE_TIME */
+/******************************************************************
+Releases an ip mutex. */
+UNIV_INLINE
+void
+ip_mutex_exit(
+/*==========*/
+ ip_mutex_hdl_t* ip_mutex_hdl); /* in: pointer to ip mutex handle */
+
+
+
+#ifndef UNIV_NONINL
+#include "sync0ipm.ic"
+#endif
+
+#endif
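
A minimal usage sketch of the API above; the shared memory location, the mutex name, and the error handling are placeholders. The creating process uses ip_mutex_create, while other processes would use ip_mutex_open on the same shared memory:

	ip_mutex_t*	ip_mutex;	/* points into suitably aligned shared memory */
	ip_mutex_hdl_t*	handle;

	if (ip_mutex_create(ip_mutex, "IB_IPMUTEX", &handle) != 0) {
		/* creation failed */
	}

	if (ip_mutex_enter(handle, SYNC_INFINITE_TIME) == 0) {
		/* ... critical section shared between processes ... */
		ip_mutex_exit(handle);
	}

	ip_mutex_close(handle);	/* frees the OS event and mutex, and the handle */
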
diff --git a/innobase/include/sync0ipm.ic b/innobase/include/sync0ipm.ic
new file mode 100644
index 00000000000..8487830e1dd
--- /dev/null
+++ b/innobase/include/sync0ipm.ic
@@ -0,0 +1,182 @@
+/******************************************************
+A fast mutex for interprocess synchronization.
+mutex_t can be used only within a single process,
+but ip_mutex_t can also be used between processes.
+
+(c) 1995 Innobase Oy
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+/* An extra structure created in the private address space of each process
+which creates or opens the ip mutex. */
+
+struct ip_mutex_hdl_struct {
+ ip_mutex_t* ip_mutex; /* pointer to ip mutex */
+ os_event_t released; /* event which signals that the mutex
+ is released; this is obtained from
+ create or open of an ip mutex */
+ os_mutex_t exclude; /* os mutex obtained when ip mutex is
+ created or opened */
+};
+
+
+UNIV_INLINE
+ulint
+ip_mutex_get_waiters(
+volatile ip_mutex_t* ipm);
+UNIV_INLINE
+void
+ip_mutex_set_waiters(
+volatile ip_mutex_t* ipm,
+ ulint flag);
+UNIV_INLINE
+mutex_t*
+ip_mutex_get_mutex(
+ ip_mutex_t* ipm);
+
+
+/******************************************************************
+Accessor functions for ip mutex. */
+UNIV_INLINE
+ulint
+ip_mutex_get_waiters(
+volatile ip_mutex_t* ipm)
+{
+ return(ipm->waiters);
+}
+UNIV_INLINE
+void
+ip_mutex_set_waiters(
+volatile ip_mutex_t* ipm,
+ ulint flag)
+{
+ ipm->waiters = flag;
+}
+UNIV_INLINE
+mutex_t*
+ip_mutex_get_mutex(
+ ip_mutex_t* ipm)
+{
+ return(&(ipm->mutex));
+}
+
+/******************************************************************
+Reserves an ip mutex. */
+UNIV_INLINE
+ulint
+ip_mutex_enter(
+/*===========*/
+ /* out: 0 if success,
+ SYNC_TIME_EXCEEDED if timeout */
+ ip_mutex_hdl_t* ip_mutex_hdl, /* in: pointer to ip mutex handle */
+ ulint time) /* in: maximum time to wait, in
+ microseconds, or
+ SYNC_INFINITE_TIME */
+{
+ mutex_t* mutex;
+ os_event_t released;
+ os_mutex_t exclude;
+ ip_mutex_t* ip_mutex;
+ ulint loop_count;
+ ulint ret;
+
+ ip_mutex = ip_mutex_hdl->ip_mutex;
+ released = ip_mutex_hdl->released;
+ exclude = ip_mutex_hdl->exclude;
+
+ mutex = ip_mutex_get_mutex(ip_mutex);
+
+ loop_count = 0;
+loop:
+ loop_count++;
+ ut_ad(loop_count < 15);
+
+ if (mutex_enter_nowait(mutex) == 0) {
+ /* Succeeded! */
+
+ return(0);
+ }
+
+ ip_mutex_system_call_count++;
+
+ os_event_reset(released);
+
+ /* Order is important here: FIRST reset event, then set waiters */
+ ip_mutex_set_waiters(ip_mutex, 1);
+
+ if (mutex_enter_nowait(mutex) == 0) {
+ /* Succeeded! */
+
+ return(0);
+ }
+
+ if (time == SYNC_INFINITE_TIME) {
+ time = OS_SYNC_INFINITE_TIME;
+ }
+
+ /* Suspend to wait for release */
+
+ ip_mutex_system_call_count++;
+
+ ret = os_event_wait_time(released, time);
+
+ ip_mutex_system_call_count++;
+
+ os_mutex_enter(exclude);
+ ip_mutex_system_call_count++;
+ os_mutex_exit(exclude);
+
+ if (ret != 0) {
+ ut_a(ret == OS_SYNC_TIME_EXCEEDED);
+
+ return(SYNC_TIME_EXCEEDED);
+ }
+
+ goto loop;
+}
+
+/******************************************************************
+Releases an ip mutex. */
+UNIV_INLINE
+void
+ip_mutex_exit(
+/*==========*/
+ ip_mutex_hdl_t* ip_mutex_hdl) /* in: pointer to ip mutex handle */
+{
+ mutex_t* mutex;
+ os_event_t released;
+ os_mutex_t exclude;
+ ip_mutex_t* ip_mutex;
+
+ ip_mutex = ip_mutex_hdl->ip_mutex;
+ released = ip_mutex_hdl->released;
+ exclude = ip_mutex_hdl->exclude;
+
+ mutex = ip_mutex_get_mutex(ip_mutex);
+
+ mutex_exit(mutex);
+
+ if (ip_mutex_get_waiters(ip_mutex) != 0) {
+
+ ip_mutex_set_waiters(ip_mutex, 0);
+
+ /* Order is important here: FIRST reset waiters,
+ then set event */
+
+ ip_mutex_system_call_count++;
+ os_mutex_enter(exclude);
+
+ /* The use of the exclude mutex seems to prevent some
+ kind of a convoy problem in the test tsproc.c. We do
+ not know why. */
+
+ ip_mutex_system_call_count++;
+
+ os_event_set(released);
+
+ ip_mutex_system_call_count++;
+
+ os_mutex_exit(exclude);
+ }
+}
diff --git a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h
new file mode 100644
index 00000000000..20afdfb025f
--- /dev/null
+++ b/innobase/include/sync0rw.h
@@ -0,0 +1,493 @@
+/******************************************************
+The read-write lock (for threads, not for database transactions)
+
+(c) 1995 Innobase Oy
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0rw_h
+#define sync0rw_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "sync0sync.h"
+#include "os0sync.h"
+
+/* The following undef is to prevent a name conflict with a macro
+in MySQL: */
+#undef rw_lock_t
+
+/* Latch types; these are used also in btr0btr.h: keep the numerical values
+smaller than 30 and the order of the numerical values like below! */
+#define RW_S_LATCH 1
+#define RW_X_LATCH 2
+#define RW_NO_LATCH 3
+
+typedef struct rw_lock_struct rw_lock_t;
+typedef struct rw_lock_debug_struct rw_lock_debug_t;
+
+typedef UT_LIST_BASE_NODE_T(rw_lock_t) rw_lock_list_t;
+
+extern rw_lock_list_t rw_lock_list;
+extern mutex_t rw_lock_list_mutex;
+
+/* The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+acquired in addition to the mutex protecting the lock. */
+extern mutex_t rw_lock_debug_mutex;
+extern os_event_t rw_lock_debug_event; /* If deadlock detection does
+ not get immediately the mutex it
+ may wait for this event */
+extern ibool rw_lock_debug_waiters; /* This is set to TRUE, if
+ there may be waiters for the event */
+
+extern ulint rw_s_system_call_count;
+extern ulint rw_s_spin_wait_count;
+extern ulint rw_s_exit_count;
+
+extern ulint rw_x_system_call_count;
+extern ulint rw_x_spin_wait_count;
+extern ulint rw_x_exit_count;
+
+/**********************************************************************
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+
+#define rw_lock_create(L) rw_lock_create_func((L), __FILE__, __LINE__)
+/*=====================*/
+/**********************************************************************
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+
+void
+rw_lock_create_func(
+/*================*/
+ rw_lock_t* lock, /* in: pointer to memory */
+ char* cfile_name, /* in: file name where created */
+ ulint cline); /* in: file line where created */
+/**********************************************************************
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Removes an rw-lock object from the global list. The
+rw-lock is checked to be in the non-locked state. */
+
+void
+rw_lock_free(
+/*=========*/
+ rw_lock_t* lock); /* in: rw-lock */
+/**********************************************************************
+Checks that the rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks. */
+
+ibool
+rw_lock_validate(
+/*=============*/
+ rw_lock_t* lock);
+/******************************************************************
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_s_lock(M) rw_lock_s_lock_func(\
+ (M), 0, __FILE__, __LINE__)
+#else
+#define rw_lock_s_lock(M) rw_lock_s_lock_func(M)
+#endif
+/******************************************************************
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_s_lock_gen(M, P) rw_lock_s_lock_func(\
+ (M), (P), __FILE__, __LINE__)
+#else
+#define rw_lock_s_lock_gen(M, P) rw_lock_s_lock_func(M)
+#endif
+/******************************************************************
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_s_lock_nowait(M) rw_lock_s_lock_func_nowait(\
+ (M), __FILE__, __LINE__)
+#else
+#define rw_lock_s_lock_nowait(M) rw_lock_s_lock_func_nowait(M)
+#endif
+/**********************************************************************
+NOTE! Use the corresponding macro, not this function directly, except if
+you supply the file name and line number. Lock an rw-lock in shared mode
+for the current thread. If the rw-lock is locked in exclusive mode, or
+there is an exclusive lock request waiting, the function spins a preset
+time (controlled by SYNC_SPIN_ROUNDS), waiting for the lock, before
+suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,ulint pass, /* in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+);
+/**********************************************************************
+NOTE! Use the corresponding macro, not this function directly, except if
+you supply the file name and line number. Lock an rw-lock in shared mode
+for the current thread if the lock can be acquired immediately. */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_func_nowait(
+/*=======================*/
+ /* out: TRUE if success */
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+);
+/**********************************************************************
+NOTE! Use the corresponding macro, not this function directly! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately. */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+ /* out: TRUE if success */
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+);
+/**********************************************************************
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+ rw_lock_t* lock /* in: rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+ ,ulint pass /* in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ );
+/***********************************************************************
+Releases a shared mode lock. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_s_unlock(L) rw_lock_s_unlock_func(L, 0)
+#else
+#define rw_lock_s_unlock(L) rw_lock_s_unlock_func(L)
+#endif
+/***********************************************************************
+Releases a shared mode lock. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L, P)
+#else
+#define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L)
+#endif
+/******************************************************************
+NOTE! The following macro should be used in rw x-locking, not the
+corresponding function. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_x_lock(M) rw_lock_x_lock_func(\
+ (M), 0, __FILE__, __LINE__)
+#else
+#define rw_lock_x_lock(M) rw_lock_x_lock_func(M, 0)
+#endif
+/******************************************************************
+NOTE! The following macro should be used in rw x-locking, not the
+corresponding function. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_x_lock_gen(M, P) rw_lock_x_lock_func(\
+ (M), (P), __FILE__, __LINE__)
+#else
+#define rw_lock_x_lock_gen(M, P) rw_lock_x_lock_func(M, P)
+#endif
+/******************************************************************
+NOTE! The following macros should be used in rw x-locking, not the
+corresponding function. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_x_lock_nowait(M) rw_lock_x_lock_func_nowait(\
+ (M), __FILE__, __LINE__)
+#else
+#define rw_lock_x_lock_nowait(M) rw_lock_x_lock_func_nowait(M)
+#endif
+/**********************************************************************
+NOTE! Use the corresponding macro, not this function directly! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+
+void
+rw_lock_x_lock_func(
+/*================*/
+ rw_lock_t* lock, /* in: pointer to rw-lock */
+ ulint pass /* in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+);
+/**********************************************************************
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+ rw_lock_t* lock /* in: rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+ ,ulint pass /* in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ );
+/***********************************************************************
+Releases an exclusive mode lock. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_x_unlock(L) rw_lock_x_unlock_func(L, 0)
+#else
+#define rw_lock_x_unlock(L) rw_lock_x_unlock_func(L)
+#endif
+/***********************************************************************
+Releases an exclusive mode lock. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L, P)
+#else
+#define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L)
+#endif
+/**********************************************************************
+Low-level function which locks an rw-lock in s-mode when we know that it
+is possible and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_s_lock_direct(
+/*==================*/
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+);
+/**********************************************************************
+Low-level function which locks an rw-lock in x-mode when we know that it
+is not locked and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_x_lock_direct(
+/*==================*/
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+);
+/**********************************************************************
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want that the current
+thread is able to acquire a second x-latch which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+ rw_lock_t* lock); /* in: lock which was x-locked in the
+ buffer read */
+/**********************************************************************
+Releases a shared mode lock when we know there are no waiters and no one
+else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_direct(
+/*====================*/
+ rw_lock_t* lock); /* in: rw-lock */
+/**********************************************************************
+Releases an exclusive mode lock when we know there are no waiters, and
+no one else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_direct(
+/*====================*/
+ rw_lock_t* lock); /* in: rw-lock */
+/**********************************************************************
+Sets the rw-lock latching level field. */
+
+void
+rw_lock_set_level(
+/*==============*/
+ rw_lock_t* lock, /* in: rw-lock */
+ ulint level); /* in: level */
+/**********************************************************************
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call. */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+ /* out: value of writer_count */
+ rw_lock_t* lock); /* in: rw-lock */
+/**********************************************************************
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0. */
+
+ibool
+rw_lock_own(
+/*========*/
+ rw_lock_t* lock, /* in: rw-lock */
+ ulint lock_type); /* in: lock type */
+/**********************************************************************
+Checks if somebody has locked the rw-lock in the specified mode. */
+
+ibool
+rw_lock_is_locked(
+/*==============*/
+ rw_lock_t* lock, /* in: rw-lock */
+ ulint lock_type); /* in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+/*******************************************************************
+Prints debug info of an rw-lock. */
+
+void
+rw_lock_print(
+/*==========*/
+ rw_lock_t* lock); /* in: rw-lock */
+/*******************************************************************
+Prints debug info of currently locked rw-locks. */
+
+void
+rw_lock_list_print_info(void);
+/*=========================*/
+/*******************************************************************
+Returns the number of currently locked rw-locks.
+Works only in the debug version. */
+
+ulint
+rw_lock_n_locked(void);
+/*==================*/
+
+/*#####################################################################*/
+
+/**********************************************************************
+Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+
+void
+rw_lock_debug_mutex_enter(void);
+/*==========================*/
+/**********************************************************************
+Releases the debug mutex. */
+
+void
+rw_lock_debug_mutex_exit(void);
+/*==========================*/
+/*************************************************************************
+Prints info of a debug struct. */
+
+void
+rw_lock_debug_print(
+/*================*/
+ rw_lock_debug_t* info); /* in: debug struct */
+
+
+#define RW_CNAME_LEN 8
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! The structure used in the spin lock
+implementation of a read-write lock. Several threads may have a shared lock
+simultaneously in this lock, but only one writer may have an exclusive lock,
+in which case no shared locks are allowed. To prevent starvation of a writer
+blocked by readers, a writer may queue for the lock by setting the writer
+field. Then no new readers are allowed in. */
+
+struct rw_lock_struct {
+ ulint reader_count; /* Number of readers who have locked this
+ lock in the shared mode */
+ ulint writer; /* This field is set to RW_LOCK_EX if there
+ is a writer owning the lock (in exclusive
+ mode), RW_LOCK_WAIT_EX if a writer is
+ queueing for the lock, and
+ RW_LOCK_NOT_LOCKED, otherwise. */
+ os_thread_id_t writer_thread;
+ /* Thread id of a possible writer thread */
+ ulint writer_count; /* Number of times the same thread has
+ recursively locked the lock in the exclusive
+ mode */
+ mutex_t mutex; /* The mutex protecting rw_lock_struct */
+ ulint pass; /* Default value 0. This is set to some
+ value != 0 given by the caller of an x-lock
+ operation, if the x-lock is to be passed to
+ another thread to unlock (which happens in
+ asynchronous i/o). */
+ ulint waiters; /* This ulint is set to 1 if there are
+ waiters (readers or writers) in the global
+ wait array, waiting for this rw_lock.
+ Otherwise, = 0. */
+ ibool writer_is_wait_ex;
+ /* This is TRUE if the writer field is
+ RW_LOCK_WAIT_EX; this field is located far
+ from the memory update hotspot fields which
+ are at the start of this struct, thus we can
+ peek this field without causing much memory
+ bus traffic */
+ UT_LIST_NODE_T(rw_lock_t) list;
+ /* All allocated rw locks are put into a
+ list */
+ UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list;
+ /* In the debug version: pointer to the debug
+ info list of the lock */
+ ulint level; /* Debug version: level in the global latching
+ order; default SYNC_LEVEL_NONE */
+ char cfile_name[RW_CNAME_LEN];
+ /* File name where lock created */
+ ulint cline; /* Line where created */
+ ulint magic_n;
+};
+
+#define RW_LOCK_MAGIC_N 22643
+
+/* The structure for storing debug info of an rw-lock */
+struct rw_lock_debug_struct {
+
+ os_thread_id_t thread_id; /* The thread id of the thread which
+ locked the rw-lock */
+ ulint pass; /* Pass value given in the lock operation */
+ ulint lock_type; /* Type of the lock: RW_LOCK_EX,
+ RW_LOCK_SHARED, RW_LOCK_WAIT_EX */
+ char* file_name; /* File name where the lock was obtained */
+ ulint line; /* Line where the rw-lock was locked */
+ UT_LIST_NODE_T(rw_lock_debug_t) list;
+ /* Debug structs are linked in a two-way
+ list */
+};
+
+#ifndef UNIV_NONINL
+#include "sync0rw.ic"
+#endif
+
+#endif
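
A short sketch of how a caller might use the macros declared above; the rw_lock_t object and the data it protects are assumptions of the example:

	rw_lock_t	lock;

	rw_lock_create(&lock);

	rw_lock_s_lock(&lock);		/* shared access */
	/* ... read the protected structure ... */
	rw_lock_s_unlock(&lock);

	rw_lock_x_lock(&lock);		/* exclusive access */
	/* ... modify the protected structure ... */
	rw_lock_x_unlock(&lock);

	rw_lock_free(&lock);	/* needed only if the memory block is freed */
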
diff --git a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic
new file mode 100644
index 00000000000..11add13d2d0
--- /dev/null
+++ b/innobase/include/sync0rw.ic
@@ -0,0 +1,510 @@
+/******************************************************
+The read-write lock (for threads)
+
+(c) 1995 Innobase Oy
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************
+Lock an rw-lock in shared mode for the current thread. If the rw-lock is
+locked in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
+waiting for the lock before suspending the thread. */
+
+void
+rw_lock_s_lock_spin(
+/*================*/
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,ulint pass, /* in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+);
+/**********************************************************************
+Inserts the debug information for an rw-lock. */
+
+void
+rw_lock_add_debug_info(
+/*===================*/
+ rw_lock_t* lock, /* in: rw-lock */
+ ulint pass, /* in: pass value */
+ ulint lock_type, /* in: lock type */
+ char* file_name, /* in: file where requested */
+ ulint line); /* in: line where requested */
+/**********************************************************************
+Removes a debug information struct for an rw-lock. */
+
+void
+rw_lock_remove_debug_info(
+/*======================*/
+ rw_lock_t* lock, /* in: rw-lock */
+ ulint pass, /* in: pass value */
+ ulint lock_type); /* in: lock type */
+
+
+/************************************************************************
+Accessor functions for rw lock. */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+ rw_lock_t* lock)
+{
+ return(lock->waiters);
+}
+UNIV_INLINE
+void
+rw_lock_set_waiters(
+/*================*/
+ rw_lock_t* lock,
+ ulint flag)
+{
+ lock->waiters = flag;
+}
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+ rw_lock_t* lock)
+{
+ return(lock->writer);
+}
+UNIV_INLINE
+void
+rw_lock_set_writer(
+/*===============*/
+ rw_lock_t* lock,
+ ulint flag)
+{
+ lock->writer = flag;
+}
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+ rw_lock_t* lock)
+{
+ return(lock->reader_count);
+}
+UNIV_INLINE
+void
+rw_lock_set_reader_count(
+/*=====================*/
+ rw_lock_t* lock,
+ ulint count)
+{
+ lock->reader_count = count;
+}
+UNIV_INLINE
+mutex_t*
+rw_lock_get_mutex(
+/*==============*/
+ rw_lock_t* lock)
+{
+ return(&(lock->mutex));
+}
+
+/**********************************************************************
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call. */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+ /* out: value of writer_count */
+ rw_lock_t* lock) /* in: rw-lock */
+{
+ return(lock->writer_count);
+}
+
+/**********************************************************************
+Low-level function which tries to lock an rw-lock in s-mode. Performs no
+spinning. */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+ /* out: TRUE if success */
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,ulint pass, /* in: pass value; != 0, if the lock will be
+ passed to another thread to unlock */
+ char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+)
+{
+ ut_ad(mutex_own(rw_lock_get_mutex(lock)));
+
+ /* Check if the writer field is free */
+
+ if (lock->writer == RW_LOCK_NOT_LOCKED) {
+ /* Set the shared lock by incrementing the reader count */
+ lock->reader_count++;
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name,
+ line);
+ #endif
+
+ return(TRUE); /* locking succeeded */
+ }
+
+ return(FALSE); /* locking did not succeed */
+}
+
+/**********************************************************************
+Low-level function which locks an rw-lock in s-mode when we know that it
+is possible and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_s_lock_direct(
+/*==================*/
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+)
+{
+ ut_ad(lock->writer == RW_LOCK_NOT_LOCKED);
+ ut_ad(rw_lock_get_reader_count(lock) == 0);
+
+ /* Set the shared lock by incrementing the reader count */
+ lock->reader_count++;
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, line);
+ #endif
+}
+
+/**********************************************************************
+Low-level function which locks an rw-lock in x-mode when we know that it
+is not locked and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_x_lock_direct(
+/*==================*/
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+)
+{
+ ut_ad(rw_lock_validate(lock));
+ ut_ad(rw_lock_get_reader_count(lock) == 0);
+ ut_ad(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED);
+
+ rw_lock_set_writer(lock, RW_LOCK_EX);
+ lock->writer_thread = os_thread_get_curr_id();
+ lock->writer_count++;
+ lock->pass = 0;
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
+ #endif
+}
+
+/**********************************************************************
+NOTE! Use the corresponding macro, not this function directly! Lock an
+rw-lock in shared mode for the current thread. If the rw-lock is locked
+in exclusive mode, or there is an exclusive lock request waiting, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for
+the lock, before suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,ulint pass, /* in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+)
+{
+ /* NOTE: As we do not know the thread ids for threads which have
+ s-locked a latch, and s-lockers will be served only after waiting
+ x-lock requests have been fulfilled, then if this thread already
+ owns an s-lock here, it may end up in a deadlock with another thread
+ which requests an x-lock here. Therefore, we will forbid recursive
+ s-locking of a latch: the following assert will warn the programmer
+of the possibility of this kind of deadlock. If we want to implement
+ safe recursive s-locking, we should keep in a list the thread ids of
+ the threads which have s-locked a latch. This would use some CPU
+ time. */
+
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */
+
+ mutex_enter(rw_lock_get_mutex(lock));
+
+ if (TRUE == rw_lock_s_lock_low(lock
+ #ifdef UNIV_SYNC_DEBUG
+ ,pass, file_name, line
+ #endif
+ )) {
+ mutex_exit(rw_lock_get_mutex(lock));
+
+ return; /* Success */
+ } else {
+ /* Did not succeed, try spin wait */
+ mutex_exit(rw_lock_get_mutex(lock));
+
+ rw_lock_s_lock_spin(lock
+ #ifdef UNIV_SYNC_DEBUG
+ ,pass, file_name, line
+ #endif
+ );
+ return;
+ }
+}
+
+/**********************************************************************
+NOTE! Use the corresponding macro, not this function directly! Lock an
+rw-lock in shared mode for the current thread if the lock can be acquired
+immediately. */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_func_nowait(
+/*=======================*/
+ /* out: TRUE if success */
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+)
+{
+ ibool success = FALSE;
+
+ mutex_enter(rw_lock_get_mutex(lock));
+
+ if (lock->writer == RW_LOCK_NOT_LOCKED) {
+ /* Set the shared lock by incrementing the reader count */
+ lock->reader_count++;
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name,
+ line);
+ #endif
+
+ success = TRUE;
+ }
+
+ mutex_exit(rw_lock_get_mutex(lock));
+
+ return(success);
+}
+
+/**********************************************************************
+NOTE! Use the corresponding macro, not this function directly! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately. */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+ /* out: TRUE if success */
+ rw_lock_t* lock /* in: pointer to rw-lock */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where lock requested */
+ ulint line /* in: line where requested */
+ #endif
+)
+{
+ ibool success = FALSE;
+
+ mutex_enter(rw_lock_get_mutex(lock));
+
+ if ((rw_lock_get_reader_count(lock) == 0)
+ && ((rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED)
+ || ((rw_lock_get_writer(lock) == RW_LOCK_EX)
+ && (lock->pass == 0)
+ && (lock->writer_thread == os_thread_get_curr_id())))) {
+
+ rw_lock_set_writer(lock, RW_LOCK_EX);
+ lock->writer_thread = os_thread_get_curr_id();
+ lock->writer_count++;
+ lock->pass = 0;
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
+ #endif
+
+ success = TRUE;
+ }
+
+ mutex_exit(rw_lock_get_mutex(lock));
+
+ ut_ad(rw_lock_validate(lock));
+
+ return(success);
+}
+
+/**********************************************************************
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+ rw_lock_t* lock /* in: rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+ ,ulint pass /* in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ )
+{
+ mutex_t* mutex = &(lock->mutex);
+ ibool sg = FALSE;
+
+ /* Acquire the mutex protecting the rw-lock fields */
+ mutex_enter(mutex);
+
+ /* Reset the shared lock by decrementing the reader count */
+
+ ut_ad(lock->reader_count > 0);
+ lock->reader_count--;
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED);
+ #endif
+
+ /* If there may be waiters and this was the last s-lock,
+ signal the object */
+
+ if (lock->waiters && (lock->reader_count == 0)) {
+ sg = TRUE;
+
+ rw_lock_set_waiters(lock, 0);
+ }
+
+ mutex_exit(mutex);
+
+ if (sg == TRUE) {
+ sync_array_signal_object(sync_primary_wait_array, lock);
+ }
+
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_s_exit_count++;
+#endif
+}
+
+/**********************************************************************
+Releases a shared mode lock when we know there are no waiters and no one
+else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_direct(
+/*====================*/
+ rw_lock_t* lock) /* in: rw-lock */
+{
+ /* Reset the shared lock by decrementing the reader count */
+
+ ut_ad(lock->reader_count > 0);
+
+ lock->reader_count--;
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED);
+ #endif
+
+ ut_ad(!lock->waiters);
+ ut_ad(rw_lock_validate(lock));
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_s_exit_count++;
+#endif
+}
+
+/**********************************************************************
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+ rw_lock_t* lock /* in: rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+ ,ulint pass /* in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ )
+{
+ ibool sg = FALSE;
+
+ /* Acquire the mutex protecting the rw-lock fields */
+ mutex_enter(&(lock->mutex));
+
+ /* Reset the exclusive lock if this thread no longer has an x-mode
+ lock */
+
+ ut_ad(lock->writer_count > 0);
+
+ lock->writer_count--;
+
+ if (lock->writer_count == 0) {
+ rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
+ }
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX);
+ #endif
+
+ /* If there may be waiters, signal the lock */
+ if (lock->waiters && (lock->writer_count == 0)) {
+
+ sg = TRUE;
+ rw_lock_set_waiters(lock, 0);
+ }
+
+ mutex_exit(&(lock->mutex));
+
+ if (sg == TRUE) {
+ sync_array_signal_object(sync_primary_wait_array, lock);
+ }
+
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_x_exit_count++;
+#endif
+}
+
+/**********************************************************************
+Releases an exclusive mode lock when we know there are no waiters, and
+no one else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_direct(
+/*====================*/
+ rw_lock_t* lock) /* in: rw-lock */
+{
+ /* Reset the exclusive lock if this thread no longer has an x-mode
+ lock */
+
+ ut_ad(lock->writer_count > 0);
+
+ lock->writer_count--;
+
+ if (lock->writer_count == 0) {
+ rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
+ }
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
+ #endif
+
+ ut_ad(!lock->waiters);
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_x_exit_count++;
+#endif
+}
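
The nowait variants above return TRUE only if the latch could be taken without waiting; a caller that must not block might use them as in this sketch (lock is the same hypothetical rw_lock_t as in the earlier example):

	if (rw_lock_x_lock_nowait(&lock)) {
		/* got the x-latch immediately */
		/* ... modify the protected structure ... */
		rw_lock_x_unlock(&lock);
	} else {
		/* could not get the latch without waiting:
		retry later or take another code path */
	}
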
diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h
new file mode 100644
index 00000000000..87c4628d2e4
--- /dev/null
+++ b/innobase/include/sync0sync.h
@@ -0,0 +1,497 @@
+/******************************************************
+Mutex, the basic synchronization primitive
+
+(c) 1995 Innobase Oy
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0sync_h
+#define sync0sync_h
+
+#include "univ.i"
+#include "sync0types.h"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+#include "os0sync.h"
+#include "sync0arr.h"
+
+/**********************************************************************
+Initializes the synchronization data structures. */
+
+void
+sync_init(void);
+/*===========*/
+/**********************************************************************
+Frees the resources in synchronization data structures. */
+
+void
+sync_close(void);
+/*===========*/
+/**********************************************************************
+Creates, or rather, initializes a mutex object to a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+
+
+#define mutex_create(M) mutex_create_func((M), __FILE__, __LINE__)
+/*===================*/
+/**********************************************************************
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+
+void
+mutex_create_func(
+/*==============*/
+ mutex_t* mutex, /* in: pointer to memory */
+ char* cfile_name, /* in: file name where created */
+ ulint cline); /* in: file line where created */
+/**********************************************************************
+Calling this function is obligatory only if the memory buffer containing
+the mutex is freed. Removes a mutex object from the mutex list. The mutex
+is checked to be in the reset state. */
+
+void
+mutex_free(
+/*=======*/
+ mutex_t* mutex); /* in: mutex */
+/******************************************************************
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__)
+#else
+#define mutex_enter(M) mutex_enter_func(M)
+#endif
+/******************************************************************
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+/* NOTE! currently same as mutex_enter! */
+
+#ifdef UNIV_SYNC_DEBUG
+#define mutex_enter_fast(M) mutex_enter_func((M), __FILE__, __LINE__)
+#else
+#define mutex_enter_fast(M) mutex_enter_func(M)
+#endif
+
+#define mutex_enter_fast_func mutex_enter_func
+/**********************************************************************
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Locks a mutex for the current thread. If the mutex is reserved
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
+for the mutex before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+ mutex_t* mutex /* in: pointer to mutex */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where locked */
+ ulint line /* in: line where locked */
+ #endif
+ );
+/************************************************************************
+Tries to lock the mutex for the current thread. If the lock is not acquired
+immediately, returns with return value 1. */
+
+ulint
+mutex_enter_nowait(
+/*===============*/
+ /* out: 0 if succeed, 1 if not */
+ mutex_t* mutex); /* in: pointer to mutex */
+/**********************************************************************
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit(
+/*=======*/
+ mutex_t* mutex); /* in: pointer to mutex */
+/**********************************************************************
+Returns TRUE if no mutex or rw-lock is currently locked.
+Works only in the debug version. */
+
+ibool
+sync_all_freed(void);
+/*================*/
+/*#####################################################################
+FUNCTION PROTOTYPES FOR DEBUGGING */
+/***********************************************************************
+Prints wait info of the sync system. */
+
+void
+sync_print_wait_info(void);
+/*======================*/
+/***********************************************************************
+Prints info of the sync system. */
+
+void
+sync_print(void);
+/*============*/
+/**********************************************************************
+Checks that the mutex has been initialized. */
+
+ibool
+mutex_validate(
+/*===========*/
+ mutex_t* mutex);
+/**********************************************************************
+Sets the mutex latching level field. */
+
+void
+mutex_set_level(
+/*============*/
+ mutex_t* mutex, /* in: mutex */
+ ulint level); /* in: level */
+/**********************************************************************
+Adds a latch and its level in the thread level array. Allocates the memory
+for the array if called for the first time for this OS thread. Makes the checks
+against other latch levels stored in the array for this thread. */
+
+void
+sync_thread_add_level(
+/*==================*/
+ void* latch, /* in: pointer to a mutex or an rw-lock */
+ ulint level); /* in: level in the latching order; if SYNC_LEVEL_NONE,
+ nothing is done */
+/**********************************************************************
+Removes a latch from the thread level array if it is found there. */
+
+ibool
+sync_thread_reset_level(
+/*====================*/
+			/* out: TRUE if found in the array; it is no error
+ if the latch is not found, as we presently are not
+ able to determine the level for every latch
+ reservation the program does */
+ void* latch); /* in: pointer to a mutex or an rw-lock */
+/**********************************************************************
+Checks that the level array for the current thread is empty. */
+
+ibool
+sync_thread_levels_empty(void);
+/*==========================*/
+ /* out: TRUE if empty */
+/**********************************************************************
+Checks that the level array for the current thread is empty. */
+
+ibool
+sync_thread_levels_empty_gen(
+/*=========================*/
+ /* out: TRUE if empty except the
+ exceptions specified below */
+ ibool dict_mutex_allowed); /* in: TRUE if dictionary mutex is
+ allowed to be owned by the thread,
+ also purge_is_running mutex is
+ allowed */
+/**********************************************************************
+Checks that the current thread owns the mutex. Works only
+in the debug version. */
+
+ibool
+mutex_own(
+/*======*/
+ /* out: TRUE if owns */
+ mutex_t* mutex); /* in: mutex */
+/**********************************************************************
+Gets the debug information for a reserved mutex. */
+
+void
+mutex_get_debug_info(
+/*=================*/
+ mutex_t* mutex, /* in: mutex */
+ char** file_name, /* out: file where requested */
+ ulint* line, /* out: line where requested */
+ os_thread_id_t* thread_id); /* out: id of the thread which owns
+ the mutex */
+/**********************************************************************
+Counts currently reserved mutexes. Works only in the debug version. */
+
+ulint
+mutex_n_reserved(void);
+/*==================*/
+/**********************************************************************
+Prints debug info of currently reserved mutexes. */
+
+void
+mutex_list_print_info(void);
+/*========================*/
+/**********************************************************************
+NOT to be used outside this module except in debugging! Gets the value
+of the lock word. */
+UNIV_INLINE
+ulint
+mutex_get_lock_word(
+/*================*/
+ mutex_t* mutex); /* in: mutex */
+/**********************************************************************
+NOT to be used outside this module except in debugging! Gets the waiters
+field in a mutex. */
+UNIV_INLINE
+ulint
+mutex_get_waiters(
+/*==============*/
+ /* out: value of the waiters field */
+ mutex_t* mutex); /* in: mutex */
+/**********************************************************************
+Implements the memory barrier operation which creates a serialization point in
+the instruction flow. This is needed because the Pentium may speculatively
+execute reads before preceding writes are committed. We could also use here
+any LOCKed instruction (see Intel Software Dev. Manual, Vol. 3). */
+
+void
+mutex_fence(void);
+/*=============*/
+
+/*
+ LATCHING ORDER WITHIN THE DATABASE
+ ==================================
+
+The mutex or latch in the central memory object, for instance, a rollback
+segment object, must be acquired before acquiring the latch or latches to
+the corresponding file data structure. In the latching order below, these
+file page object latches are placed immediately below the corresponding
+central memory object latch or mutex.
+
+Synchronization object Notes
+---------------------- -----
+
+Dictionary mutex If we have a pointer to a dictionary
+| object, e.g., a table, it can be
+| accessed without reserving the
+| dictionary mutex. We must have a
+| reservation, a memoryfix, to the
+| appropriate table object in this case,
+| and the table must be explicitly
+| released later.
+V
+Dictionary header
+|
+V
+Secondary index tree latch The tree latch protects also all
+| the B-tree non-leaf pages. These
+V can be read with the page only
+Secondary index non-leaf bufferfixed to save CPU time,
+| no s-latch is needed on the page.
+| Modification of a page requires an
+| x-latch on the page, however. If a
+| thread owns an x-latch to the tree,
+| it is allowed to latch non-leaf pages
+| even after it has acquired the fsp
+| latch.
+V
+Secondary index leaf The latch on the secondary index leaf
+| can be kept while accessing the
+| clustered index, to save CPU time.
+V
+Clustered index tree latch To increase concurrency, the tree
+| latch is usually released when the
+| leaf page latch has been acquired.
+V
+Clustered index non-leaf
+|
+V
+Clustered index leaf
+|
+V
+Transaction system header
+|
+V
+Transaction undo mutex The undo log entry must be written
+| before any index page is modified.
+| Transaction undo mutex is for the undo
+| logs the analogue of the tree latch
+| for a B-tree. If a thread has the
+| trx undo mutex reserved, it is allowed
+| to latch the undo log pages in any
+| order, and also after it has acquired
+| the fsp latch.
+V
+Rollback segment mutex The rollback segment mutex must be
+| reserved, if, e.g., a new page must
+| be added to an undo log. The rollback
+| segment and the undo logs in its
+| history list can be seen as an
+| analogue of a B-tree, and the latches
+| reserved similarly, using a version of
+| lock-coupling. If an undo log must be
+| extended by a page when inserting an
+| undo log record, this corresponds to
+| a pessimistic insert in a B-tree.
+V
+Rollback segment header
+|
+V
+Purge system latch
+|
+V
+Undo log pages If a thread owns the trx undo mutex,
+| or for a log in the history list, the
+| rseg mutex, it is allowed to latch
+| undo log pages in any order, and even
+| after it has acquired the fsp latch.
+| If a thread does not have the
+| appropriate mutex, it is allowed to
+| latch only a single undo log page in
+| a mini-transaction.
+V
+File space management latch If a mini-transaction must allocate
+| several file pages, it can do that,
+| because it keeps the x-latch to the
+| file space management in its memo.
+V
+File system pages
+|
+V
+Kernel mutex If a kernel operation needs a file
+| page allocation, it must reserve the
+| fsp x-latch before acquiring the kernel
+| mutex.
+V
+Search system mutex
+|
+V
+Buffer pool mutex
+|
+V
+Log mutex
+|
+V
+Any other latch
+|
+V
+Memory pool mutex */
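+
+/* Editorial illustration, not part of the original header: a minimal sketch
+of how the latching-order debug machinery declared above is meant to be used.
+It assumes mutex_create(), presumably declared in the earlier part of this
+header, and the mutex_enter()/mutex_exit() pair defined in sync0sync.ic;
+example_mutex is a made-up name. Under UNIV_SYNC_DEBUG, entering the mutex
+presumably records the latch via sync_thread_add_level(), which checks the
+requested level against the levels of the latches the thread already holds
+(enforcing the descending order of the chart above), and mutex_exit() removes
+it again through sync_thread_reset_level().
+
+ mutex_t example_mutex;
+
+ mutex_create(&example_mutex);
+ mutex_set_level(&example_mutex, SYNC_ANY_LATCH);
+
+ mutex_enter(&example_mutex);
+ ... critical section ...
+ mutex_exit(&example_mutex);
+*/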
+
+/* Latching order levels */
+#define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress
+ latching order checking */
+#define SYNC_LEVEL_NONE 2000 /* default: level not defined */
+#define SYNC_DICT 1000
+#define SYNC_PURGE_IS_RUNNING 997
+#define SYNC_DICT_HEADER 995
+#define SYNC_IBUF_HEADER 914
+#define SYNC_IBUF_PESS_INSERT_MUTEX 912
+#define SYNC_IBUF_MUTEX 910 /* ibuf mutex is really below
+ SYNC_FSP_PAGE: we assign a value this
+ high only to get the program to pass
+ the debug checks */
+/*-------------------------------*/
+#define SYNC_INDEX_TREE 900
+#define SYNC_TREE_NODE_NEW 892
+#define SYNC_TREE_NODE_FROM_HASH 891
+#define SYNC_TREE_NODE 890
+#define SYNC_PURGE_SYS 810
+#define SYNC_PURGE_LATCH 800
+#define SYNC_TRX_UNDO 700
+#define SYNC_RSEG 600
+#define SYNC_RSEG_HEADER_NEW 591
+#define SYNC_RSEG_HEADER 590
+#define SYNC_TRX_UNDO_PAGE 570
+#define SYNC_FSP 400
+#define SYNC_FSP_PAGE 395
+/*------------------------------------- Insert buffer headers */
+/*------------------------------------- ibuf_mutex */
+/*------------------------------------- Insert buffer trees */
+#define SYNC_IBUF_BITMAP_MUTEX 351
+#define SYNC_IBUF_BITMAP 350
+/*-------------------------------*/
+#define SYNC_KERNEL 300
+#define SYNC_REC_LOCK 299
+#define SYNC_TRX_LOCK_HEAP 298
+#define SYNC_TRX_SYS_HEADER 290
+#define SYNC_LOG 170
+#define SYNC_RECV 168
+#define SYNC_SEARCH_SYS 160 /* NOTE that if we have a memory
+ heap that can be extended to the
+ buffer pool, its logical level is
+ SYNC_SEARCH_SYS, as memory allocation
+ can call routines there! Otherwise
+ the level is SYNC_MEM_HASH. */
+#define SYNC_BUF_POOL 150
+#define SYNC_BUF_BLOCK 149
+#define SYNC_ANY_LATCH 135
+#define SYNC_MEM_HASH 131
+#define SYNC_MEM_POOL 130
+
+/* Codes used to designate lock operations */
+#define RW_LOCK_NOT_LOCKED 350
+#define RW_LOCK_EX 351
+#define RW_LOCK_EXCLUSIVE 351
+#define RW_LOCK_SHARED 352
+#define RW_LOCK_WAIT_EX 353
+#define SYNC_MUTEX 354
+
+#define MUTEX_CNAME_LEN 8
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! The structure used in the spin lock
+implementation of a mutual exclusion semaphore. */
+
+struct mutex_struct {
+ ulint lock_word; /* This ulint is the target of the atomic
+ test-and-set instruction in Win32 */
+#ifndef _WIN32
+ os_fast_mutex_t
+ os_fast_mutex; /* In other systems we use this OS mutex
+ in place of lock_word */
+#endif
+ ulint waiters; /* This ulint is set to 1 if there are (or
+ may be) threads waiting in the global wait
+ array for this mutex to be released.
+ Otherwise, this is 0. */
+ UT_LIST_NODE_T(mutex_t) list; /* All allocated mutexes are put into
+ a list. Pointers to the next and prev. */
+ os_thread_id_t thread_id; /* Debug version: The thread id of the
+ thread which locked the mutex. */
+ char* file_name; /* Debug version: File name where the mutex
+ was locked */
+ ulint line; /* Debug version: Line where the mutex was
+ locked */
+ ulint level; /* Debug version: level in the global latching
+ order; default SYNC_LEVEL_NONE */
+ char cfile_name[MUTEX_CNAME_LEN];
+ /* File name where mutex created */
+ ulint cline; /* Line where created */
+ ulint magic_n;
+};
+
+#define MUTEX_MAGIC_N (ulint)979585
+
+/* The global array of wait cells for implementation of the database's own
+mutexes and read-write locks. Appears here for debugging purposes only! */
+
+extern sync_array_t* sync_primary_wait_array;
+
+/* Constant determining how long spin wait is continued before suspending
+the thread. A value of 600 rounds on a 1995-era 100 MHz Pentium seems to correspond
+to 20 microseconds. */
+
+#define SYNC_SPIN_ROUNDS srv_n_spin_wait_rounds
+
+#define SYNC_INFINITE_TIME ((ulint)(-1))
+
+/* Means that a timeout elapsed when waiting */
+
+#define SYNC_TIME_EXCEEDED (ulint)1
+
+/* The number of system calls made in this module. Intended for performance
+monitoring. */
+
+extern ulint mutex_system_call_count;
+extern ulint mutex_exit_count;
+
+/* Latching order checks start when this is set TRUE */
+extern ibool sync_order_checks_on;
+
+/* This variable is set to TRUE when sync_init is called */
+extern ibool sync_initialized;
+
+#ifndef UNIV_NONINL
+#include "sync0sync.ic"
+#endif
+
+#endif
diff --git a/innobase/include/sync0sync.ic b/innobase/include/sync0sync.ic
new file mode 100644
index 00000000000..a937ac5d579
--- /dev/null
+++ b/innobase/include/sync0sync.ic
@@ -0,0 +1,226 @@
+/******************************************************
+Mutex, the basic synchronization primitive
+
+(c) 1995 Innobase Oy
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************
+Sets the waiters field in a mutex. */
+
+void
+mutex_set_waiters(
+/*==============*/
+ mutex_t* mutex, /* in: mutex */
+ ulint n); /* in: value to set */
+/**********************************************************************
+Reserves a mutex for the current thread. If the mutex is reserved, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
+for the mutex before suspending the thread. */
+
+void
+mutex_spin_wait(
+/*============*/
+ mutex_t* mutex /* in: pointer to mutex */
+
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where mutex requested */
+ ulint line /* in: line where requested */
+ #endif
+);
+/**********************************************************************
+Sets the debug information for a reserved mutex. */
+
+void
+mutex_set_debug_info(
+/*=================*/
+ mutex_t* mutex, /* in: mutex */
+ char* file_name, /* in: file where requested */
+ ulint line); /* in: line where requested */
+/**********************************************************************
+Releases the threads waiting in the primary wait array for this mutex. */
+
+void
+mutex_signal_object(
+/*================*/
+ mutex_t* mutex); /* in: mutex */
+
+/**********************************************************************
+Performs an atomic test-and-set instruction to the lock_word field of a
+mutex. */
+UNIV_INLINE
+ulint
+mutex_test_and_set(
+/*===============*/
+ /* out: the previous value of lock_word: 0 or
+ 1 */
+ mutex_t* mutex) /* in: mutex */
+{
+#ifdef _WIN32
+ ulint res;
+ ulint* lw; /* assembler code is used to ensure that
+ lock_word is loaded from memory */
+ ut_ad(mutex);
+ ut_ad(sizeof(ulint) == 4);
+
+ lw = &(mutex->lock_word);
+
+ __asm MOV ECX, lw
+ __asm MOV EDX, 1
+ __asm XCHG EDX, DWORD PTR [ECX]
+ __asm MOV res, EDX
+
+ /* The fence below would prevent this thread from reading the data
+ structure protected by the mutex before the test-and-set operation is
+ committed, but the fence is apparently not needed:
+
+ In a posting to comp.arch newsgroup (August 10, 1997) Andy Glew said
+ that in P6 a LOCKed instruction like XCHG establishes a fence with
+ respect to memory reads and writes and thus an explicit fence is not
+ needed. In P5 he seemed to agree with a previous newsgroup poster that
+ LOCKed instructions serialize all instruction execution, and,
+ consequently, also memory operations. This is confirmed in Intel
+ Software Dev. Manual, Vol. 3. */
+
+ /* mutex_fence(); */
+
+ return(res);
+#else
+ ibool ret;
+
+ ret = os_fast_mutex_trylock(&(mutex->os_fast_mutex));
+
+ if (ret == 0) {
+ mutex->lock_word = 1;
+ }
+
+ return(ret);
+#endif
+}
+
+/**********************************************************************
+Performs a reset instruction to the lock_word field of a mutex. This
+instruction also serializes memory operations to the program order. */
+UNIV_INLINE
+void
+mutex_reset_lock_word(
+/*==================*/
+ mutex_t* mutex) /* in: mutex */
+{
+#ifdef _WIN32
+ ulint* lw; /* assembler code is used to ensure that
+ lock_word is loaded from memory */
+ ut_ad(mutex);
+
+ lw = &(mutex->lock_word);
+
+ __asm MOV EDX, 0
+ __asm MOV ECX, lw
+ __asm XCHG EDX, DWORD PTR [ECX]
+#else
+ mutex->lock_word = 0;
+
+ os_fast_mutex_unlock(&(mutex->os_fast_mutex));
+#endif
+}
+
+/**********************************************************************
+Gets the value of the lock word. */
+UNIV_INLINE
+ulint
+mutex_get_lock_word(
+/*================*/
+ mutex_t* mutex) /* in: mutex */
+{
+ volatile ulint* ptr; /* declared volatile to ensure that
+ lock_word is loaded from memory */
+ ut_ad(mutex);
+
+ ptr = &(mutex->lock_word);
+
+ return(*ptr);
+}
+
+/**********************************************************************
+Gets the waiters field in a mutex. */
+UNIV_INLINE
+ulint
+mutex_get_waiters(
+/*==============*/
+ /* out: value of the waiters field */
+ mutex_t* mutex) /* in: mutex */
+{
+ volatile ulint* ptr; /* declared volatile to ensure that
+ the value is read from memory */
+ ut_ad(mutex);
+
+ ptr = &(mutex->waiters);
+
+ return(*ptr); /* Here we assume that the read of a single
+ word from memory is atomic */
+}
+
+/**********************************************************************
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit(
+/*=======*/
+ mutex_t* mutex) /* in: pointer to mutex */
+{
+ ut_ad(mutex_own(mutex));
+
+#ifdef UNIV_SYNC_DEBUG
+ mutex->thread_id = ULINT_UNDEFINED;
+
+ sync_thread_reset_level(mutex);
+#endif
+ mutex_reset_lock_word(mutex);
+
+ if (mutex_get_waiters(mutex) != 0) {
+
+ mutex_signal_object(mutex);
+ }
+
+#ifdef UNIV_SYNC_PERF_STAT
+ mutex_exit_count++;
+#endif
+}
+
+/**********************************************************************
+Locks a mutex for the current thread. If the mutex is reserved, the function
+spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for the mutex
+before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+ mutex_t* mutex /* in: pointer to mutex */
+ #ifdef UNIV_SYNC_DEBUG
+ ,char* file_name, /* in: file name where locked */
+ ulint line /* in: line where locked */
+ #endif
+ )
+{
+ ut_ad(mutex_validate(mutex));
+
+ /* Note that we do not peek at the value of lock_word before trying
+ the atomic test_and_set; we could peek, and possibly save time. */
+
+ if (!mutex_test_and_set(mutex)) {
+
+ #ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+ #endif
+
+ return; /* Succeeded! */
+ }
+
+ mutex_spin_wait(mutex
+ #ifdef UNIV_SYNC_DEBUG
+ ,file_name,
+ line
+ #endif
+ );
+}
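+
+/* Editorial note, not part of the original file: a compact summary of the
+acquire/release protocol implemented by the inlined functions above; the names
+are only those declared in this file, and mutex_enter() is presumably a macro
+that passes __FILE__ and __LINE__ to mutex_enter_func() in debug builds.
+
+ acquire: if mutex_test_and_set(m) returns 0, the lock was free and is now
+ owned; otherwise mutex_spin_wait(m) spins up to SYNC_SPIN_ROUNDS
+ rounds and then suspends the thread in the primary wait array.
+ release: mutex_reset_lock_word(m); if mutex_get_waiters(m) != 0, then
+ mutex_signal_object(m) wakes the threads waiting for this mutex.
+*/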
diff --git a/innobase/include/sync0types.h b/innobase/include/sync0types.h
new file mode 100644
index 00000000000..2c31f80cca3
--- /dev/null
+++ b/innobase/include/sync0types.h
@@ -0,0 +1,15 @@
+/******************************************************
+Global types for sync
+
+(c) 1995 Innobase Oy
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0types_h
+#define sync0types_h
+
+typedef struct mutex_struct mutex_t;
+
+
+#endif
diff --git a/innobase/include/thr0loc.h b/innobase/include/thr0loc.h
new file mode 100644
index 00000000000..32e2dc3ae93
--- /dev/null
+++ b/innobase/include/thr0loc.h
@@ -0,0 +1,67 @@
+/******************************************************
+The thread local storage
+
+(c) 1995 Innobase Oy
+
+Created 10/5/1995 Heikki Tuuri
+*******************************************************/
+
+/* This module implements storage private to each thread,
+a capability useful in some situations like storing the
+OS handle to the current thread, or its priority. */
+
+#ifndef thr0loc_h
+#define thr0loc_h
+
+#include "univ.i"
+#include "os0thread.h"
+
+/********************************************************************
+Initializes the thread local storage module. */
+
+void
+thr_local_init(void);
+/*================*/
+/***********************************************************************
+Creates a local storage struct for the calling new thread. */
+
+void
+thr_local_create(void);
+/*==================*/
+/***********************************************************************
+Frees the local storage struct for the specified thread. */
+
+void
+thr_local_free(
+/*===========*/
+ os_thread_id_t id); /* in: thread id */
+/***********************************************************************
+Gets the slot number in the thread table of a thread. */
+
+ulint
+thr_local_get_slot_no(
+/*==================*/
+ /* out: slot number */
+ os_thread_id_t id); /* in: thread id of the thread */
+/***********************************************************************
+Sets in the local storage the slot number in the thread table of a thread. */
+
+void
+thr_local_set_slot_no(
+/*==================*/
+ os_thread_id_t id, /* in: thread id of the thread */
+ ulint slot_no);/* in: slot number */
+/***********************************************************************
+Returns pointer to the 'in_ibuf' field within the current thread local
+storage. */
+
+ibool*
+thr_local_get_in_ibuf_field(void);
+/*=============================*/
+ /* out: pointer to the in_ibuf field */
+
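+/* Editorial illustration, not part of the original header: a minimal usage
+sketch of the API above, assuming os_thread_get_curr_id() from os0thread.h
+returns the id of the calling thread.
+
+ os_thread_id_t id = os_thread_get_curr_id();
+
+ thr_local_create(); (once, when the thread starts)
+ thr_local_set_slot_no(id, slot_no);
+ ... = thr_local_get_slot_no(id);
+ *thr_local_get_in_ibuf_field() = FALSE;
+ thr_local_free(id); (when the thread exits)
+*/
+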
+#ifndef UNIV_NONINL
+#include "thr0loc.ic"
+#endif
+
+#endif
diff --git a/innobase/include/thr0loc.ic b/innobase/include/thr0loc.ic
new file mode 100644
index 00000000000..b8b8136180c
--- /dev/null
+++ b/innobase/include/thr0loc.ic
@@ -0,0 +1,7 @@
+/******************************************************
+Thread local storage
+
+(c) 1995 Innobase Oy
+
+Created 10/4/1995 Heikki Tuuri
+*******************************************************/
diff --git a/innobase/include/trx0purge.h b/innobase/include/trx0purge.h
new file mode 100644
index 00000000000..8870ebc936c
--- /dev/null
+++ b/innobase/include/trx0purge.h
@@ -0,0 +1,166 @@
+/******************************************************
+Purge old versions
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0purge_h
+#define trx0purge_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "que0types.h"
+#include "page0page.h"
+#include "usr0sess.h"
+#include "fil0fil.h"
+
+/* The global data structure coordinating a purge */
+extern trx_purge_t* purge_sys;
+
+/* A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+extern trx_undo_rec_t trx_purge_dummy_rec;
+
+/************************************************************************
+Calculates the file address of an undo log header when we have the file
+address of its history list node. */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+ /* out: file address of the log */
+ fil_addr_t node_addr); /* in: file address of the history
+ list node of the log */
+/*********************************************************************
+Checks if trx_id is >= purge_view: then it is guaranteed that its update
+undo log still exists in the system. */
+
+ibool
+trx_purge_update_undo_must_exist(
+/*=============================*/
+ /* out: TRUE if is sure that it is preserved, also
+ if the function returns FALSE, it is possible that
+ the undo log still exists in the system */
+ dulint trx_id);/* in: transaction id */
+/************************************************************************
+Creates the global purge system control structure and initializes the history
+mutex. */
+
+void
+trx_purge_sys_create(void);
+/*======================*/
+/************************************************************************
+Adds the update undo log as the first log in the history list. Removes the
+update undo log segment from the rseg slot if it is too big for reuse. */
+
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+ trx_t* trx, /* in: transaction */
+ page_t* undo_page, /* in: update undo log header page,
+ x-latched */
+ mtr_t* mtr); /* in: mtr */
+/************************************************************************
+Fetches the next undo log record from the history list to purge. It must be
+released with the corresponding release function. */
+
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+ /* out: copy of an undo log record, or
+ pointer to the dummy undo log record
+ &trx_purge_dummy_rec if the whole undo log
+ can be skipped in purge; NULL if none left */
+ dulint* roll_ptr,/* out: roll pointer to undo record */
+ trx_undo_inf_t** cell, /* out: storage cell for the record in the
+ purge array */
+ mem_heap_t* heap); /* in: memory heap where copied */
+/***********************************************************************
+Releases a reserved purge undo record. */
+
+void
+trx_purge_rec_release(
+/*==================*/
+ trx_undo_inf_t* cell); /* in: storage cell */
+/***********************************************************************
+This function runs a purge batch. */
+
+ulint
+trx_purge(void);
+/*===========*/
+ /* out: number of undo log pages handled in
+ the batch */
+
+/* The control structure used in the purge operation */
+struct trx_purge_struct{
+ ulint state; /* Purge system state */
+ sess_t* sess; /* System session running the purge
+ query */
+ trx_t* trx; /* System transaction running the purge
+ query: this trx is not in the trx list
+ of the trx system and it never ends */
+ que_t* query; /* The query graph which will do the
+ parallelized purge operation */
+ rw_lock_t purge_is_running;/* The purge operation sets an x-latch
+ here while it is accessing a table:
+ this prevents the table from being
+ dropped */
+ rw_lock_t latch; /* The latch protecting the purge view.
+ A purge operation must acquire an
+ x-latch here for the instant at which
+ it changes the purge view: an undo
+ log operation can prevent this by
+ obtaining an s-latch here. */
+ read_view_t* view; /* The purge will not remove undo logs
+ which are >= this view (purge view) */
+ mutex_t mutex; /* Mutex protecting the fields below */
+ ulint n_pages_handled;/* Approximate number of undo log
+ pages processed in purge */
+ ulint handle_limit; /* Target of how many pages to get
+ processed in the current purge */
+ /*------------------------------*/
+ /* The following two fields form the 'purge pointer' which advances
+ during a purge, and which is used in history list truncation */
+
+ dulint purge_trx_no; /* Purge has advanced past all
+ transactions whose number is less
+ than this */
+ dulint purge_undo_no; /* Purge has advanced past all records
+ whose undo number is less than this */
+ /*-----------------------------*/
+ ibool next_stored; /* TRUE if the info of the next record
+ to purge is stored below: if yes, then
+ the transaction number and the undo
+ number of the record are stored in
+ purge_trx_no and purge_undo_no above */
+ trx_rseg_t* rseg; /* Rollback segment for the next undo
+ record to purge */
+ ulint page_no; /* Page number for the next undo
+ record to purge, page number of the
+ log header, if dummy record */
+ ulint offset; /* Page offset for the next undo
+ record to purge, 0 if the dummy
+ record */
+ ulint hdr_page_no; /* Header page of the undo log where
+ the next record to purge belongs */
+ ulint hdr_offset; /* Header byte offset on the page */
+ /*-----------------------------*/
+ trx_undo_arr_t* arr; /* Array of transaction numbers and
+ undo numbers of the undo records
+ currently under processing in purge */
+ mem_heap_t* heap; /* Temporary storage used during a
+ purge: can be emptied after purge
+ completes */
+};
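+
+/* Editorial illustration, not part of the original header: a minimal sketch,
+under stated assumptions, of how the fetch/release pair declared above is
+driven. The real work is done by the query graph in purge_sys->query; this
+only shows the record-level protocol. mem_heap_create()/mem_heap_free() are
+assumed to come from mem0mem.h.
+
+ trx_undo_rec_t* rec;
+ trx_undo_inf_t* cell;
+ dulint roll_ptr;
+ mem_heap_t* heap = mem_heap_create(256);
+
+ for (;;) {
+ rec = trx_purge_fetch_next_rec(&roll_ptr, &cell, heap);
+
+ if (rec == NULL) {
+ break; (nothing left to purge)
+ }
+
+ if (rec != &trx_purge_dummy_rec) {
+ (remove the old version that rec and roll_ptr identify)
+ }
+
+ trx_purge_rec_release(cell);
+ }
+
+ mem_heap_free(heap);
+*/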
+
+#define TRX_PURGE_ON 1 /* purge operation is running */
+#define TRX_STOP_PURGE 2 /* purge operation is stopped, or
+ it should be stopped */
+#ifndef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#endif
diff --git a/innobase/include/trx0purge.ic b/innobase/include/trx0purge.ic
new file mode 100644
index 00000000000..451e8ca31d0
--- /dev/null
+++ b/innobase/include/trx0purge.ic
@@ -0,0 +1,26 @@
+/******************************************************
+Purge old versions
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+/************************************************************************
+Calculates the file address of an undo log header when we have the file
+address of its history list node. */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+ /* out: file address of the log */
+ fil_addr_t node_addr) /* in: file address of the history
+ list node of the log */
+{
+ node_addr.boffset -= TRX_UNDO_HISTORY_NODE;
+
+ return(node_addr);
+}
+
diff --git a/innobase/include/trx0rec.h b/innobase/include/trx0rec.h
new file mode 100644
index 00000000000..ea9e9f3fce5
--- /dev/null
+++ b/innobase/include/trx0rec.h
@@ -0,0 +1,284 @@
+/******************************************************
+Transaction undo log record
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rec_h
+#define trx0rec_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "dict0types.h"
+#include "que0types.h"
+#include "data0data.h"
+#include "rem0types.h"
+
+/***************************************************************************
+Copies the undo record to the heap. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ /* out, own: copy of undo log record */
+ trx_undo_rec_t* undo_rec, /* in: undo log record */
+ mem_heap_t* heap); /* in: heap where copied */
+/**************************************************************************
+Reads the undo log record type. */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+ /* out: record type */
+ trx_undo_rec_t* undo_rec); /* in: undo log record */
+/**************************************************************************
+Reads from an undo log record the record compiler info. */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+ /* out: compiler info */
+ trx_undo_rec_t* undo_rec); /* in: undo log record */
+/**************************************************************************
+Reads the undo log record number. */
+UNIV_INLINE
+dulint
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ /* out: undo no */
+ trx_undo_rec_t* undo_rec); /* in: undo log record */
+/**************************************************************************
+Reads from an undo log record the general parameters. */
+
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ /* out: remaining part of undo log
+ record after reading these values */
+ trx_undo_rec_t* undo_rec, /* in: undo log record */
+ ulint* type, /* out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /* out: compiler info, relevant only
+ for update type records */
+ dulint* undo_no, /* out: undo log record number */
+ dulint* table_id); /* out: table id */
+/***********************************************************************
+Builds a row reference from an undo log record. */
+
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ /* out: pointer to remaining part of undo
+ record */
+ byte* ptr, /* in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /* in: clustered index */
+ dtuple_t** ref, /* out, own: row reference */
+ mem_heap_t* heap); /* in: memory heap from which the memory
+ needed is allocated */
+/***********************************************************************
+Skips a row reference from an undo log record. */
+
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ /* out: pointer to remaining part of undo
+ record */
+ byte* ptr, /* in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index); /* in: clustered index */
+/**************************************************************************
+Reads from an undo log update record the system field values of the old
+version. */
+
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ /* out: remaining part of undo log
+ record after reading these values */
+ byte* ptr, /* in: remaining part of undo log
+ record after reading general
+ parameters */
+ dulint* trx_id, /* out: trx id */
+ dulint* roll_ptr, /* out: roll ptr */
+ ulint* info_bits); /* out: info bits state */
+/***********************************************************************
+Builds an update vector based on a remaining part of an undo log record. */
+
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ /* out: remaining part of the record */
+ byte* ptr, /* in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /* in: clustered index */
+ ulint type, /* in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ dulint trx_id, /* in: transaction id from this undo record */
+ dulint roll_ptr,/* in: roll pointer from this undo record */
+ ulint info_bits,/* in: info bits from this undo record */
+ mem_heap_t* heap, /* in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd); /* out, own: update vector */
+/***********************************************************************
+Builds a partial row from an update undo log record. It contains the
+columns which occur as ordering fields in any index of the table. */
+
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ /* out: pointer to remaining part of undo
+ record */
+ byte* ptr, /* in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /* in: clustered index */
+ dtuple_t** row, /* out, own: partial row */
+ mem_heap_t* heap); /* in: memory heap from which the memory
+ needed is allocated */
+/***************************************************************************
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction. */
+
+ulint
+trx_undo_report_row_operation(
+/*==========================*/
+ /* out: DB_SUCCESS or error code */
+ ulint flags, /* in: if BTR_NO_UNDO_LOG_FLAG bit is
+ set, does nothing */
+ ulint op_type, /* in: TRX_UNDO_INSERT_OP or
+ TRX_UNDO_MODIFY_OP */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: clustered index */
+ dtuple_t* clust_entry, /* in: in the case of an insert,
+ index entry to insert into the
+ clustered index, otherwise NULL */
+ upd_t* update, /* in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /* in: compiler info on secondary
+ index updates */
+ rec_t* rec, /* in: case of an update or delete
+ marking, the record in the clustered
+ index, otherwise NULL */
+ dulint* roll_ptr); /* out: rollback pointer to the
+ inserted undo log record,
+ ut_dulint_zero if BTR_NO_UNDO_LOG
+ flag was specified */
+/**********************************************************************
+Copies an undo record to heap. This function can be called if we know that
+the undo log record exists. */
+
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+/*======================*/
+ /* out, own: copy of the record */
+ dulint roll_ptr, /* in: roll pointer to record */
+ mem_heap_t* heap); /* in: memory heap where copied */
+/**********************************************************************
+Copies an undo record to heap. */
+
+ulint
+trx_undo_get_undo_rec(
+/*==================*/
+ /* out: DB_SUCCESS, or
+ DB_MISSING_HISTORY if the undo log
+ has been truncated and we cannot
+ fetch the old version; NOTE: the
+ caller must have latches on the
+ clustered index page and purge_view */
+ dulint roll_ptr, /* in: roll pointer to record */
+ dulint trx_id, /* in: id of the trx that generated
+ the roll pointer: it points to an
+ undo log of this transaction */
+ trx_undo_rec_t** undo_rec, /* out, own: copy of the record */
+ mem_heap_t* heap); /* in: memory heap where copied */
+/***********************************************************************
+Builds a previous version of a clustered index record. This function checks
+that the caller has a latch on the index page of the clustered index record
+and an s-latch on the purge_view. This guarantees that the stack of versions
+is locked. */
+
+ulint
+trx_undo_prev_version_build(
+/*========================*/
+ /* out: DB_SUCCESS, or DB_MISSING_HISTORY if
+ the previous version is not >= purge_view,
+ which means that it may have been removed */
+ rec_t* index_rec,/* in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr,/* in: mtr which contains the latch to
+ index_rec page and purge_view */
+ rec_t* rec, /* in: version of a clustered index record */
+ dict_index_t* index, /* in: clustered index */
+ mem_heap_t* heap, /* in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers);/* out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted */
+/***************************************************************
+Parses a redo log record of adding an undo log record. */
+
+byte*
+trx_undo_parse_add_undo_rec(
+/*========================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page); /* in: page or NULL */
+/***************************************************************
+Parses a redo log record of erasing an undo page end. */
+
+byte*
+trx_undo_parse_erase_page_end(
+/*==========================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+
+/* Types of an undo log record: these have to be smaller than 16, as the
+compilation info multiplied by 16 is ORed to this value in an undo log
+record */
+#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */
+#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked
+ record */
+#define TRX_UNDO_UPD_DEL_REC 13 /* update of a delete marked record to
+ a not delete marked record; also the
+ fields of the record can change */
+#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields
+ do not change */
+#define TRX_UNDO_CMPL_INFO_MULT 16 /* compilation info is multiplied by
+ this and ORed to the type above */
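+
+/* Editorial note, not part of the original header: the type byte of an undo
+log record (read at offset 2 by the inline functions in trx0rec.ic) packs both
+fields as cmpl_info * TRX_UNDO_CMPL_INFO_MULT + type. For example, cmpl_info
+== 2 with type == TRX_UNDO_UPD_EXIST_REC (12) gives 2 * 16 + 12 = 44, and the
+readers recover the fields as 44 & (TRX_UNDO_CMPL_INFO_MULT - 1) == 12 and
+44 / TRX_UNDO_CMPL_INFO_MULT == 2. */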
+
+/* Operation type flags used in trx_undo_report_row_operation */
+#define TRX_UNDO_INSERT_OP 1
+#define TRX_UNDO_MODIFY_OP 2
+
+#ifndef UNIV_NONINL
+#include "trx0rec.ic"
+#endif
+
+#endif
diff --git a/innobase/include/trx0rec.ic b/innobase/include/trx0rec.ic
new file mode 100644
index 00000000000..f813a52ff9c
--- /dev/null
+++ b/innobase/include/trx0rec.ic
@@ -0,0 +1,69 @@
+/******************************************************
+Transaction undo log record
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/**************************************************************************
+Reads from an undo log record the record type. */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+ /* out: record type */
+ trx_undo_rec_t* undo_rec) /* in: undo log record */
+{
+ return(mach_read_from_1(undo_rec + 2) & (TRX_UNDO_CMPL_INFO_MULT - 1));
+}
+
+/**************************************************************************
+Reads from an undo log record the record compiler info. */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+ /* out: compiler info */
+ trx_undo_rec_t* undo_rec) /* in: undo log record */
+{
+ return(mach_read_from_1(undo_rec + 2) / TRX_UNDO_CMPL_INFO_MULT);
+}
+
+/**************************************************************************
+Reads the undo log record number. */
+UNIV_INLINE
+dulint
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ /* out: undo no */
+ trx_undo_rec_t* undo_rec) /* in: undo log record */
+{
+ byte* ptr;
+
+ ptr = undo_rec + 3;
+
+ return(mach_dulint_read_much_compressed(ptr));
+}
+
+/***************************************************************************
+Copies the undo record to the heap. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ /* out, own: copy of undo log record */
+ trx_undo_rec_t* undo_rec, /* in: undo log record */
+ mem_heap_t* heap) /* in: heap where copied */
+{
+ ulint len;
+ trx_undo_rec_t* rec_copy;
+
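+ /* Note (editor): as the computation below implies, the two bytes at
+ the start of an undo record hold the page offset of the record's end
+ (the start of the next record); the length is that offset minus the
+ record's own offset within its buffer frame. */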
+ len = mach_read_from_2(undo_rec) + buf_frame_align(undo_rec)
+ - undo_rec;
+ rec_copy = mem_heap_alloc(heap, len);
+
+ ut_memcpy(rec_copy, undo_rec, len);
+
+ return(rec_copy);
+}
diff --git a/innobase/include/trx0roll.h b/innobase/include/trx0roll.h
new file mode 100644
index 00000000000..c456768e820
--- /dev/null
+++ b/innobase/include/trx0roll.h
@@ -0,0 +1,216 @@
+/******************************************************
+Transaction rollback
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0roll_h
+#define trx0roll_h
+
+#include "univ.i"
+#include "trx0trx.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+
+/***********************************************************************
+Returns a transaction savepoint taken at this point in time. */
+
+trx_savept_t
+trx_savept_take(
+/*============*/
+ /* out: savepoint */
+ trx_t* trx); /* in: transaction */
+/***********************************************************************
+Creates an undo number array. */
+
+trx_undo_arr_t*
+trx_undo_arr_create(void);
+/*=====================*/
+/***********************************************************************
+Frees an undo number array. */
+
+void
+trx_undo_arr_free(
+/*==============*/
+ trx_undo_arr_t* arr); /* in: undo number array */
+/***********************************************************************
+Returns pointer to nth element in an undo number array. */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+ /* out: pointer to the nth element */
+ trx_undo_arr_t* arr, /* in: undo number array */
+ ulint n); /* in: position */
+/***************************************************************************
+Tries to truncate the undo logs. */
+
+void
+trx_roll_try_truncate(
+/*==================*/
+ trx_t* trx); /* in: transaction */
+/************************************************************************
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record to the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release. */
+
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+ /* out: undo log record copied to heap, NULL
+ if none left, or if the undo number of the
+ top record would be less than the limit */
+ trx_t* trx, /* in: transaction */
+ dulint limit, /* in: least undo number we need */
+ dulint* roll_ptr,/* out: roll pointer to undo record */
+ mem_heap_t* heap); /* in: memory heap where copied */
+/************************************************************************
+Reserves an undo log record for a query thread to undo. This should be
+called if the query thread obtains the undo log record by some means other
+than the pop function above. */
+
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+ /* out: TRUE if succeeded */
+ trx_t* trx, /* in: transaction */
+ dulint undo_no);/* in: undo number of the record */
+/***********************************************************************
+Releases a reserved undo record. */
+
+void
+trx_undo_rec_release(
+/*=================*/
+ trx_t* trx, /* in: transaction */
+ dulint undo_no);/* in: undo number */
+/*************************************************************************
+Starts a rollback operation. */
+
+void
+trx_rollback(
+/*=========*/
+ trx_t* trx, /* in: transaction */
+ trx_sig_t* sig, /* in: signal starting the rollback */
+ que_thr_t** next_thr);/* in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/***********************************************************************
+Rolls back uncommitted transactions which have no user session. */
+
+void
+trx_rollback_all_without_sess(void);
+/*===============================*/
+/********************************************************************
+Finishes a transaction rollback. */
+
+void
+trx_finish_rollback_off_kernel(
+/*===========================*/
+ que_t* graph, /* in: undo graph which can now be freed */
+ trx_t* trx, /* in: transaction */
+ que_thr_t** next_thr);/* in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if this parameter is
+ NULL, it is ignored */
+/********************************************************************
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph. */
+
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ /* out, own: the query graph */
+ trx_t* trx); /* in: trx handle */
+/*************************************************************************
+Creates a rollback command node struct. */
+
+roll_node_t*
+roll_node_create(
+/*=============*/
+ /* out, own: rollback node struct */
+ mem_heap_t* heap); /* in: mem heap where created */
+/***************************************************************
+Performs an execution step for a rollback command node in a query graph. */
+
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ /* out: query thread to run next, or NULL */
+ que_thr_t* thr); /* in: query thread */
+/***********************************************************************
+Rolls back a transaction used in MySQL. */
+
+int
+trx_rollback_for_mysql(
+/*===================*/
+ /* out: error code or DB_SUCCESS */
+ trx_t* trx); /* in: transaction handle */
+/***********************************************************************
+Rolls back the latest SQL statement for MySQL. */
+
+int
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ /* out: error code or DB_SUCCESS */
+ trx_t* trx); /* in: transaction handle */
+/***********************************************************************
+Rolls back a transaction used in MySQL, either entirely or partially to a
+savepoint. */
+
+int
+trx_general_rollback_for_mysql(
+/*===========================*/
+ /* out: error code or DB_SUCCESS */
+ trx_t* trx, /* in: transaction handle */
+ ibool partial,/* in: TRUE if partial rollback requested */
+ trx_savept_t* savept);/* in: pointer to savepoint undo number, if
+ partial rollback requested */
+
+extern sess_t* trx_dummy_sess;
+
+/* A cell in the array used during a rollback and a purge */
+struct trx_undo_inf_struct{
+ dulint trx_no; /* transaction number: not defined during
+ a rollback */
+ dulint undo_no; /* undo number of an undo record */
+ ibool in_use; /* TRUE if the cell is in use */
+};
+
+/* During a rollback and a purge, undo numbers of undo records currently being
+processed are stored in this array */
+
+struct trx_undo_arr_struct{
+ ulint n_cells; /* number of cells in the array */
+ ulint n_used; /* number of cells currently in use */
+ trx_undo_inf_t* infos; /* the array of undo infos */
+ mem_heap_t* heap; /* memory heap from which allocated */
+};
+
+/* Rollback command node in a query graph */
+struct roll_node_struct{
+ que_common_t common; /* node type: QUE_NODE_ROLLBACK */
+ ulint state; /* node execution state */
+ ibool partial;/* TRUE if we want a partial rollback */
+ trx_savept_t savept; /* savepoint to which to roll back, in the
+ case of a partial rollback */
+};
+
+/* Rollback node states */
+#define ROLL_NODE_SEND 1
+#define ROLL_NODE_WAIT 2
+
+#ifndef UNIV_NONINL
+#include "trx0roll.ic"
+#endif
+
+#endif
diff --git a/innobase/include/trx0roll.ic b/innobase/include/trx0roll.ic
new file mode 100644
index 00000000000..dfde83ac478
--- /dev/null
+++ b/innobase/include/trx0roll.ic
@@ -0,0 +1,23 @@
+/******************************************************
+Transaction rollback
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/***********************************************************************
+Returns pointer to nth element in an undo number array. */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+ /* out: pointer to the nth element */
+ trx_undo_arr_t* arr, /* in: undo number array */
+ ulint n) /* in: position */
+{
+ ut_ad(arr);
+ ut_ad(n < arr->n_cells);
+
+ return(arr->infos + n);
+}
diff --git a/innobase/include/trx0rseg.h b/innobase/include/trx0rseg.h
new file mode 100644
index 00000000000..fd64612ab3f
--- /dev/null
+++ b/innobase/include/trx0rseg.h
@@ -0,0 +1,193 @@
+/******************************************************
+Rollback segment
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rseg_h
+#define trx0rseg_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "trx0sys.h"
+
+/**********************************************************************
+Gets a rollback segment header. */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get(
+/*==========*/
+ /* out: rollback segment header, page
+ x-latched */
+ ulint space, /* in: space where placed */
+ ulint page_no, /* in: page number of the header */
+ mtr_t* mtr); /* in: mtr */
+/**********************************************************************
+Gets a newly created rollback segment header. */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get_new(
+/*==============*/
+ /* out: rollback segment header, page
+ x-latched */
+ ulint space, /* in: space where placed */
+ ulint page_no, /* in: page number of the header */
+ mtr_t* mtr); /* in: mtr */
+/*******************************************************************
+Gets the file page number of the nth undo log slot. */
+UNIV_INLINE
+ulint
+trx_rsegf_get_nth_undo(
+/*===================*/
+ /* out: page number of the undo log segment */
+ trx_rsegf_t* rsegf, /* in: rollback segment header */
+ ulint n, /* in: index of slot */
+ mtr_t* mtr); /* in: mtr */
+/*******************************************************************
+Sets the file page number of the nth undo log slot. */
+UNIV_INLINE
+void
+trx_rsegf_set_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /* in: rollback segment header */
+ ulint n, /* in: index of slot */
+ ulint page_no,/* in: page number of the undo log segment */
+ mtr_t* mtr); /* in: mtr */
+/********************************************************************
+Looks for a free slot for an undo log segment. */
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+ /* out: slot index or ULINT_UNDEFINED if not
+ found */
+ trx_rsegf_t* rsegf, /* in: rollback segment header */
+ mtr_t* mtr); /* in: mtr */
+/**********************************************************************
+Looks for a rollback segment, based on the rollback segment id. */
+
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+ /* out: rollback segment */
+ ulint id); /* in: rollback segment id */
+/********************************************************************
+Creates a rollback segment header. This function is called only when
+a new rollback segment is created in the database. */
+
+ulint
+trx_rseg_header_create(
+/*===================*/
+ /* out: page number of the created segment,
+ FIL_NULL if fail */
+ ulint space, /* in: space id */
+ ulint max_size, /* in: max size in pages */
+ ulint* slot_no, /* out: rseg id == slot number in trx sys */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************************
+Creates the memory copies for rollback segments and initializes the
+rseg list and array in trx_sys at a database startup. */
+
+void
+trx_rseg_list_and_array_init(
+/*=========================*/
+ trx_sysf_t* sys_header, /* in: trx system header */
+ mtr_t* mtr); /* in: mtr */
+/********************************************************************
+Creates a new rollback segment in the database. */
+
+trx_rseg_t*
+trx_rseg_create(
+/*============*/
+ /* out: the created segment object, NULL if
+ fail */
+ ulint space, /* in: space id */
+ ulint max_size, /* in: max size in pages */
+ ulint* id, /* out: rseg id */
+ mtr_t* mtr); /* in: mtr */
+
+
+/* Number of undo log slots in a rollback segment file copy */
+#define TRX_RSEG_N_SLOTS 1024
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2)
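+
+/* Editorial note: with TRX_RSEG_N_SLOTS == 1024 this evaluates to 512; the
+division by two presumably reflects that one transaction may need two slots,
+one for its insert undo log and one for its update undo log (see the lists in
+trx_rseg_struct below). */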
+
+/* The rollback segment memory object */
+struct trx_rseg_struct{
+ /*--------------------------------------------------------*/
+ ulint id; /* rollback segment id == the index of
+ its slot in the trx system file copy */
+ mutex_t mutex; /* mutex protecting the fields in this
+ struct except id; NOTE that the latching
+ order must always be kernel mutex ->
+ rseg mutex */
+ ulint space; /* space where the rollback segment
+ header is placed */
+ ulint page_no;/* page number of the rollback segment
+ header */
+ ulint max_size;/* maximum allowed size in pages */
+ ulint curr_size;/* current size in pages */
+ /*--------------------------------------------------------*/
+ /* Fields for update undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_list;
+ /* List of update undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_cached;
+ /* List of update undo log segments
+ cached for fast reuse */
+ /*--------------------------------------------------------*/
+ /* Fields for insert undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_list;
+ /* List of insert undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_cached;
+ /* List of insert undo log segments
+ cached for fast reuse */
+ /*--------------------------------------------------------*/
+ ulint last_page_no; /* Page number of the last not yet
+ purged log header in the history list;
+ FIL_NULL if all list purged */
+ ulint last_offset; /* Byte offset of the last not yet
+ purged log header */
+ dulint last_trx_no; /* Transaction number of the last not
+ yet purged log */
+ ibool last_del_marks; /* TRUE if the last not yet purged log
+ needs purging */
+ /*--------------------------------------------------------*/
+ UT_LIST_NODE_T(trx_rseg_t) rseg_list;
+ /* the list of the rollback segment
+ memory objects */
+};
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of
+ an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size */
+#define TRX_RSEG_SLOT_SIZE 4
+
+/* The offset of the rollback segment header on its page */
+#define TRX_RSEG FSEG_PAGE_DATA
+
+/* Transaction rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_MAX_SIZE 0 /* Maximum allowed size for rollback
+ segment in pages */
+#define TRX_RSEG_HISTORY_SIZE 4 /* Number of file pages occupied
+ by the logs in the history list */
+#define TRX_RSEG_HISTORY 8 /* The update undo logs for committed
+ transactions */
+#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE)
+ /* Header for the file segment where
+ this page is placed */
+#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE)
+ /* Undo log segment slots */
+/*-------------------------------------------------------------*/
+
+#ifndef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#endif
diff --git a/innobase/include/trx0rseg.ic b/innobase/include/trx0rseg.ic
new file mode 100644
index 00000000000..aeb4466ff0f
--- /dev/null
+++ b/innobase/include/trx0rseg.ic
@@ -0,0 +1,112 @@
+/******************************************************
+Rollback segment
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "srv0srv.h"
+
+/**********************************************************************
+Gets a rollback segment header. */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get(
+/*==========*/
+ /* out: rollback segment header, page
+ x-latched */
+ ulint space, /* in: space where placed */
+ ulint page_no, /* in: page number of the header */
+ mtr_t* mtr) /* in: mtr */
+{
+ trx_rsegf_t* header;
+
+ header = TRX_RSEG + buf_page_get(space, page_no, RW_X_LATCH, mtr);
+
+ buf_page_dbg_add_level(header, SYNC_RSEG_HEADER);
+
+ return(header);
+}
+
+/**********************************************************************
+Gets a newly created rollback segment header. */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get_new(
+/*==============*/
+ /* out: rollback segment header, page
+ x-latched */
+ ulint space, /* in: space where placed */
+ ulint page_no, /* in: page number of the header */
+ mtr_t* mtr) /* in: mtr */
+{
+ trx_rsegf_t* header;
+
+ header = TRX_RSEG + buf_page_get(space, page_no, RW_X_LATCH, mtr);
+
+ buf_page_dbg_add_level(header, SYNC_RSEG_HEADER_NEW);
+
+ return(header);
+}
+
+/*******************************************************************
+Gets the file page number of the nth undo log slot. */
+UNIV_INLINE
+ulint
+trx_rsegf_get_nth_undo(
+/*===================*/
+ /* out: page number of the undo log segment */
+ trx_rsegf_t* rsegf, /* in: rollback segment header */
+ ulint n, /* in: index of slot */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(n < TRX_RSEG_N_SLOTS);
+
+ return(mtr_read_ulint(rsegf + TRX_RSEG_UNDO_SLOTS +
+ n * TRX_RSEG_SLOT_SIZE, MLOG_4BYTES, mtr));
+}
+
+/*******************************************************************
+Sets the file page number of the nth undo log slot. */
+UNIV_INLINE
+void
+trx_rsegf_set_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /* in: rollback segment header */
+ ulint n, /* in: index of slot */
+ ulint page_no,/* in: page number of the undo log segment */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(n < TRX_RSEG_N_SLOTS);
+
+ mlog_write_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE,
+ page_no, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************
+Looks for a free slot for an undo log segment. */
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+ /* out: slot index or ULINT_UNDEFINED if not
+ found */
+ trx_rsegf_t* rsegf, /* in: rollback segment header */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint i;
+ ulint page_no;
+
+ for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+
+ page_no = trx_rsegf_get_nth_undo(rsegf, i, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
diff --git a/innobase/include/trx0sys.h b/innobase/include/trx0sys.h
new file mode 100644
index 00000000000..d0506dd65b7
--- /dev/null
+++ b/innobase/include/trx0sys.h
@@ -0,0 +1,270 @@
+/******************************************************
+Transaction system
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0sys_h
+#define trx0sys_h
+
+#include "univ.i"
+
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "sync0sync.h"
+#include "ut0lst.h"
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "fut0lst.h"
+#include "fsp0fsp.h"
+#include "read0types.h"
+
+/* The transaction system */
+extern trx_sys_t* trx_sys;
+
+/*******************************************************************
+Checks if a page address is the trx sys header page. */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+ /* out: TRUE if trx sys header page */
+ ulint space, /* in: space */
+ ulint page_no);/* in: page number */
+/*********************************************************************
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started. */
+
+void
+trx_sys_init_at_db_start(void);
+/*==========================*/
+/*********************************************************************
+Creates and initializes the transaction system at the database creation. */
+
+void
+trx_sys_create(void);
+/*================*/
+/********************************************************************
+Looks for a free slot for a rollback segment in the trx system file copy. */
+
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+ /* out: slot index or ULINT_UNDEFINED
+ if not found */
+ mtr_t* mtr); /* in: mtr */
+/*******************************************************************
+Gets the pointer in the nth slot of the rseg array. */
+UNIV_INLINE
+trx_rseg_t*
+trx_sys_get_nth_rseg(
+/*=================*/
+ /* out: pointer to rseg object, NULL if slot
+ not in use */
+ trx_sys_t* sys, /* in: trx system */
+ ulint n); /* in: index of slot */
+/*******************************************************************
+Sets the pointer in the nth slot of the rseg array. */
+UNIV_INLINE
+void
+trx_sys_set_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /* in: trx system */
+ ulint n, /* in: index of slot */
+ trx_rseg_t* rseg); /* in: pointer to rseg object, NULL if slot
+ not in use */
+/**************************************************************************
+Gets a pointer to the transaction system file copy and x-latches its page. */
+UNIV_INLINE
+trx_sysf_t*
+trx_sysf_get(
+/*=========*/
+		/* out: pointer to system file copy, page x-latched */
+ mtr_t* mtr); /* in: mtr */
+/*********************************************************************
+Gets the space of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_space(
+/*====================*/
+ /* out: space id */
+ trx_sysf_t* sys_header, /* in: trx sys file copy */
+ ulint i, /* in: slot index == rseg id */
+ mtr_t* mtr); /* in: mtr */
+/*********************************************************************
+Gets the page number of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_page_no(
+/*======================*/
+ /* out: page number, FIL_NULL
+ if slot unused */
+ trx_sysf_t* sys_header, /* in: trx sys file copy */
+ ulint i, /* in: slot index == rseg id */
+ mtr_t* mtr); /* in: mtr */
+/*********************************************************************
+Sets the space id of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_space(
+/*====================*/
+ trx_sysf_t* sys_header, /* in: trx sys file copy */
+ ulint i, /* in: slot index == rseg id */
+ ulint space, /* in: space id */
+ mtr_t* mtr); /* in: mtr */
+/*********************************************************************
+Sets the page number of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /* in: trx sys file copy */
+ ulint i, /* in: slot index == rseg id */
+ ulint page_no, /* in: page number, FIL_NULL if
+ the slot is reset to unused */
+ mtr_t* mtr); /* in: mtr */
+/*********************************************************************
+Allocates a new transaction id. */
+UNIV_INLINE
+dulint
+trx_sys_get_new_trx_id(void);
+/*========================*/
+ /* out: new, allocated trx id */
+/*********************************************************************
+Allocates a new transaction number. */
+UNIV_INLINE
+dulint
+trx_sys_get_new_trx_no(void);
+/*========================*/
+ /* out: new, allocated trx number */
+/*********************************************************************
+Writes a trx id to an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_trx_id(
+/*=============*/
+ byte* ptr, /* in: pointer to memory where written */
+ dulint id); /* in: id */
+/*********************************************************************
+Reads a trx id from an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_read_... */
+UNIV_INLINE
+dulint
+trx_read_trx_id(
+/*============*/
+ /* out: id */
+ byte* ptr); /* in: pointer to memory from where to read */
+/********************************************************************
+Looks for the trx handle with the given id in trx_list. */
+UNIV_INLINE
+trx_t*
+trx_get_on_id(
+/*==========*/
+ /* out: the trx handle or NULL if not found */
+ dulint trx_id); /* in: trx id to search for */
+/********************************************************************
+Returns the minimum trx id in trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->conc_state to
+find out if the minimum trx id transaction itself is active, or already
+committed.) */
+UNIV_INLINE
+dulint
+trx_list_get_min_trx_id(void);
+/*=========================*/
+ /* out: the minimum trx id, or trx_sys->max_trx_id
+ if the trx list is empty */
+/********************************************************************
+Checks if a transaction with the given id is active. */
+UNIV_INLINE
+ibool
+trx_is_active(
+/*==========*/
+ /* out: TRUE if active */
+ dulint trx_id);/* in: trx id of the transaction */
+/********************************************************************
+Checks that trx is in the trx list. */
+
+ibool
+trx_in_trx_list(
+/*============*/
+ /* out: TRUE if is in */
+ trx_t* in_trx);/* in: trx */
+
+/* The automatically created system rollback segment has this id */
+#define TRX_SYS_SYSTEM_RSEG_ID 0
+
+/* Max number of rollback segments: the number of segment specification slots
+in the transaction system array; rollback segment id must fit in one byte,
+therefore 256 */
+#define TRX_SYS_N_RSEGS 256
+
+/* Space id and page no where the trx system file copy resides */
+#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
+#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
+
+/* The offset of the transaction system header on the page */
+#define TRX_SYS FSEG_PAGE_DATA
+
+/* Transaction system header; protected by trx_sys->mutex */
+/*-------------------------------------------------------------*/
+#define TRX_SYS_TRX_ID_STORE	0	/* The maximum trx id or trx number
+					modulo TRX_SYS_TRX_ID_WRITE_MARGIN
+					written to a file page by any
+					transaction; the assignment of
+					transaction ids continues from this
+					number rounded up to the next multiple
+					of .._MARGIN, plus .._MARGIN, when the
+					database is started */
+#define TRX_SYS_FSEG_HEADER 8 /* segment header for the tablespace
+ segment the trx system is created
+ into */
+#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE)
+ /* the start of the array of rollback
+ segment specification slots */
+/*-------------------------------------------------------------*/
+
+/* The transaction system central memory data structure; protected by the
+kernel mutex */
+struct trx_sys_struct{
+ dulint max_trx_id; /* The smallest number not yet
+ assigned as a transaction id or
+ transaction number */
+ UT_LIST_BASE_NODE_T(trx_t) trx_list;
+ /* List of active and committed in
+ memory transactions, sorted on trx id,
+ biggest first */
+ UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list;
+ /* List of rollback segment objects */
+ trx_rseg_t* latest_rseg; /* Latest rollback segment in the
+ round-robin assignment of rollback
+ segments to transactions */
+ trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS];
+ /* Pointer array to rollback segments;
+ NULL if slot not in use */
+ UT_LIST_BASE_NODE_T(read_view_t) view_list;
+ /* List of read views sorted on trx no,
+ biggest first */
+};
+
+/* When a trx id which is zero modulo this number (which must be a power of
+two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system
+page is updated */
+#define TRX_SYS_TRX_ID_WRITE_MARGIN 256
+
+#ifndef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#endif
diff --git a/innobase/include/trx0sys.ic b/innobase/include/trx0sys.ic
new file mode 100644
index 00000000000..786e7905933
--- /dev/null
+++ b/innobase/include/trx0sys.ic
@@ -0,0 +1,352 @@
+/******************************************************
+Transaction system
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "srv0srv.h"
+#include "trx0trx.h"
+#include "data0type.h"
+
+/* The typedef for rseg slot in the file copy */
+typedef byte trx_sysf_rseg_t;
+
+/* Rollback segment specification slot offsets */
+/*-------------------------------------------------------------*/
+#define	TRX_SYS_RSEG_SPACE	0	/* space where the segment
+					header is placed */
+#define	TRX_SYS_RSEG_PAGE_NO	4	/* page number where the segment
+					header is placed; this is FIL_NULL
+					if the slot is unused */
+/*-------------------------------------------------------------*/
+/* Size of a rollback segment specification slot */
+#define TRX_SYS_RSEG_SLOT_SIZE 8
+
+/*********************************************************************
+Writes the value of max_trx_id to the file based trx system header. */
+
+void
+trx_sys_flush_max_trx_id(void);
+/*==========================*/
+
+/*******************************************************************
+Checks if a page address is the trx sys header page. */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+ /* out: TRUE if trx sys header page */
+ ulint space, /* in: space */
+ ulint page_no)/* in: page number */
+{
+ if ((space == TRX_SYS_SPACE) && (page_no == TRX_SYS_PAGE_NO)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************
+Gets the pointer in the nth slot of the rseg array. */
+UNIV_INLINE
+trx_rseg_t*
+trx_sys_get_nth_rseg(
+/*=================*/
+ /* out: pointer to rseg object, NULL if slot
+ not in use */
+ trx_sys_t* sys, /* in: trx system */
+ ulint n) /* in: index of slot */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(n < TRX_SYS_N_RSEGS);
+
+ return(sys->rseg_array[n]);
+}
+
+/*******************************************************************
+Sets the pointer in the nth slot of the rseg array. */
+UNIV_INLINE
+void
+trx_sys_set_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /* in: trx system */
+ ulint n, /* in: index of slot */
+ trx_rseg_t* rseg) /* in: pointer to rseg object, NULL if slot
+ not in use */
+{
+ ut_ad(n < TRX_SYS_N_RSEGS);
+
+ sys->rseg_array[n] = rseg;
+}
+
+/**************************************************************************
+Gets a pointer to the transaction system header and x-latches its page. */
+UNIV_INLINE
+trx_sysf_t*
+trx_sysf_get(
+/*=========*/
+ /* out: pointer to system header, page x-latched. */
+ mtr_t* mtr) /* in: mtr */
+{
+ trx_sysf_t* header;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(mtr);
+
+ header = TRX_SYS + buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, mtr);
+
+ buf_page_dbg_add_level(header, SYNC_TRX_SYS_HEADER);
+
+ return(header);
+}
+
+/*********************************************************************
+Gets the space of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_space(
+/*====================*/
+ /* out: space id */
+ trx_sysf_t* sys_header, /* in: trx sys header */
+ ulint i, /* in: slot index == rseg id */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE, MLOG_4BYTES, mtr));
+}
+
+/*********************************************************************
+Gets the page number of the nth rollback segment slot in the trx system
+header. */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_page_no(
+/*======================*/
+ /* out: page number, FIL_NULL
+ if slot unused */
+ trx_sysf_t* sys_header, /* in: trx system header */
+ ulint i, /* in: slot index == rseg id */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(sys_header);
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_PAGE_NO, MLOG_4BYTES, mtr));
+}
+
+/*********************************************************************
+Sets the space id of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_space(
+/*====================*/
+ trx_sysf_t* sys_header, /* in: trx sys file copy */
+ ulint i, /* in: slot index == rseg id */
+ ulint space, /* in: space id */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ mlog_write_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE,
+ space,
+ MLOG_4BYTES, mtr);
+}
+
+/*********************************************************************
+Sets the page number of the nth rollback segment slot in the trx system
+header. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /* in: trx sys header */
+ ulint i, /* in: slot index == rseg id */
+ ulint page_no, /* in: page number, FIL_NULL if the
+ slot is reset to unused */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ mlog_write_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_PAGE_NO,
+ page_no,
+ MLOG_4BYTES, mtr);
+}
+
+/*********************************************************************
+Writes a trx id to an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_trx_id(
+/*=============*/
+ byte* ptr, /* in: pointer to memory where written */
+ dulint id) /* in: id */
+{
+ ut_ad(DATA_TRX_ID_LEN == 6);
+
+ mach_write_to_6(ptr, id);
+}
+
+/*********************************************************************
+Reads a trx id from an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_read_... */
+UNIV_INLINE
+dulint
+trx_read_trx_id(
+/*============*/
+ /* out: id */
+ byte* ptr) /* in: pointer to memory from where to read */
+{
+ ut_ad(DATA_TRX_ID_LEN == 6);
+
+ return(mach_read_from_6(ptr));
+}
+
+/********************************************************************
+Looks for the trx handle with the given id in trx_list. */
+UNIV_INLINE
+trx_t*
+trx_get_on_id(
+/*==========*/
+ /* out: the trx handle or NULL if not found */
+ dulint trx_id) /* in: trx id to search for */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx != NULL) {
+ if (0 == ut_dulint_cmp(trx_id, trx->id)) {
+
+ return(trx);
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ return(NULL);
+}
+
+/********************************************************************
+Returns the minimum trx id in trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->conc_state to
+find out if the minimum trx id transaction itself is active, or already
+committed.) */
+UNIV_INLINE
+dulint
+trx_list_get_min_trx_id(void)
+/*=========================*/
+ /* out: the minimum trx id, or trx_sys->max_trx_id
+ if the trx list is empty */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ trx = UT_LIST_GET_LAST(trx_sys->trx_list);
+
+ if (trx == NULL) {
+
+ return(trx_sys->max_trx_id);
+ }
+
+ return(trx->id);
+}
+
+/********************************************************************
+Checks if a transaction with the given id is active. */
+UNIV_INLINE
+ibool
+trx_is_active(
+/*==========*/
+ /* out: TRUE if active */
+ dulint trx_id) /* in: trx id of the transaction */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ if (ut_dulint_cmp(trx_id, trx_list_get_min_trx_id()) < 0) {
+
+ return(FALSE);
+ }
+
+ trx = trx_get_on_id(trx_id);
+ if (trx && (trx->conc_state == TRX_ACTIVE)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************
+Allocates a new transaction id. */
+UNIV_INLINE
+dulint
+trx_sys_get_new_trx_id(void)
+/*========================*/
+ /* out: new, allocated trx id */
+{
+ dulint id;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+	/* VERY important: after the database is started, the max_trx_id value
+	is divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, so the following if
+	will evaluate to TRUE the first time this function is called after
+	startup, and the trx id value will be written to the disk-based
+	header! Thus trx id values will not overlap when the database is
+	repeatedly started! */
+
+ if (ut_dulint_get_low(trx_sys->max_trx_id)
+ % TRX_SYS_TRX_ID_WRITE_MARGIN == 0) {
+
+ trx_sys_flush_max_trx_id();
+ }
+
+ id = trx_sys->max_trx_id;
+
+ UT_DULINT_INC(trx_sys->max_trx_id);
+
+ return(id);
+}
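+
+/* Worked example of the write margin logic above, using the value
+TRX_SYS_TRX_ID_WRITE_MARGIN == 256 defined in trx0sys.h: if the value last
+flushed to TRX_SYS_TRX_ID_STORE was 1024, then every trx id handed out
+before a crash was below 1024 + 256, because reaching 1280 would have
+triggered another flush. Assuming the startup code resumes id assignment
+from the stored value rounded up to the next multiple of the margin plus
+one extra margin (1024 + 2 * 256 = 1536 here), ids assigned after the
+restart cannot collide with ids assigned before it. */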
+
+/*********************************************************************
+Allocates a new transaction number. */
+UNIV_INLINE
+dulint
+trx_sys_get_new_trx_no(void)
+/*========================*/
+ /* out: new, allocated trx number */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ return(trx_sys_get_new_trx_id());
+}
diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h
new file mode 100644
index 00000000000..e2a1b4435e7
--- /dev/null
+++ b/innobase/include/trx0trx.h
@@ -0,0 +1,412 @@
+/******************************************************
+The transaction
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0trx_h
+#define trx0trx_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "lock0types.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "mem0mem.h"
+#include "read0types.h"
+
+/* If this flag is defined, then unneeded update undo logs are discarded,
+saving CPU time. The kernel mutex contention is increased, however. */
+
+#define TRX_UPDATE_UNDO_OPT
+
+extern ulint trx_n_mysql_transactions;
+
+/********************************************************************
+Creates and initializes a transaction object. */
+
+trx_t*
+trx_create(
+/*=======*/
+ /* out, own: the transaction */
+ sess_t* sess); /* in: session or NULL */
+/************************************************************************
+Creates a transaction object for MySQL. */
+
+trx_t*
+trx_allocate_for_mysql(void);
+/*========================*/
+ /* out, own: transaction object */
+/************************************************************************
+Frees a transaction object. */
+
+void
+trx_free(
+/*=====*/
+ trx_t* trx); /* in, own: trx object */
+/************************************************************************
+Frees a transaction object for MySQL. */
+
+void
+trx_free_for_mysql(
+/*===============*/
+ trx_t* trx); /* in, own: trx object */
+/********************************************************************
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+
+void
+trx_lists_init_at_db_start(void);
+/*============================*/
+/********************************************************************
+Starts a new transaction. */
+
+ibool
+trx_start(
+/*======*/
+ /* out: TRUE if success, FALSE if the rollback
+ segment could not support this many transactions */
+ trx_t* trx, /* in: transaction */
+ ulint rseg_id);/* in: rollback segment id; if ULINT_UNDEFINED
+ is passed, the system chooses the rollback segment
+ automatically in a round-robin fashion */
+/********************************************************************
+Starts a new transaction. */
+
+ibool
+trx_start_low(
+/*==========*/
+ /* out: TRUE */
+ trx_t* trx, /* in: transaction */
+ ulint rseg_id);/* in: rollback segment id; if ULINT_UNDEFINED
+ is passed, the system chooses the rollback segment
+ automatically in a round-robin fashion */
+/*****************************************************************
+Starts the transaction if it is not yet started. */
+UNIV_INLINE
+void
+trx_start_if_not_started(
+/*=====================*/
+ trx_t* trx); /* in: transaction */
+/********************************************************************
+Commits a transaction. */
+
+void
+trx_commit_off_kernel(
+/*==================*/
+ trx_t* trx); /* in: transaction */
+/**************************************************************************
+Does the transaction commit for MySQL. */
+
+ulint
+trx_commit_for_mysql(
+/*=================*/
+ /* out: 0 or error number */
+ trx_t* trx); /* in: trx handle */
+/**************************************************************************
+Marks the latest SQL statement ended. */
+
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx); /* in: trx handle */
+/************************************************************************
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a newly started transaction. */
+
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+ /* out: consistent read view */
+ trx_t* trx); /* in: active transaction */
+/***************************************************************
+The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
+the TRX_QUE_RUNNING state and releases query threads which were
+waiting for a lock in the wait_thrs list. */
+
+void
+trx_end_lock_wait(
+/*==============*/
+ trx_t* trx); /* in: transaction */
+/********************************************************************
+Sends a signal to a trx object. */
+
+ibool
+trx_sig_send(
+/*=========*/
+ /* out: TRUE if the signal was
+ successfully delivered */
+ trx_t* trx, /* in: trx handle */
+ ulint type, /* in: signal type */
+ ulint sender, /* in: TRX_SIG_SELF or
+ TRX_SIG_OTHER_SESS */
+ ibool reply, /* in: TRUE if the sender of the signal
+ wants reply after the operation induced
+ by the signal is completed; if type
+ is TRX_SIG_END_WAIT, this must be
+ FALSE */
+ que_thr_t* receiver_thr, /* in: query thread which wants the
+ reply, or NULL */
+ trx_savept_t* savept, /* in: possible rollback savepoint, or
+ NULL */
+ que_thr_t** next_thr); /* in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the parameter
+ is NULL, it is ignored */
+/********************************************************************
+Send the reply message when a signal in the queue of the trx has
+been handled. */
+
+void
+trx_sig_reply(
+/*==========*/
+ trx_t* trx, /* in: trx handle */
+ trx_sig_t* sig, /* in: signal */
+ que_thr_t** next_thr); /* in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/********************************************************************
+Removes the signal object from a trx signal queue. */
+
+void
+trx_sig_remove(
+/*===========*/
+ trx_t* trx, /* in: trx handle */
+ trx_sig_t* sig); /* in, own: signal */
+/********************************************************************
+Starts handling of a trx signal. */
+
+void
+trx_sig_start_handle(
+/*=================*/
+ trx_t* trx, /* in: trx handle */
+ que_thr_t** next_thr); /* in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/********************************************************************
+Ends signal handling. If the session is in the error state, and
+trx->graph_before_signal_handling != NULL, returns control to the error
+handling routine of the graph (currently only returns the control to the
+graph root which then sends an error message to the client). */
+
+void
+trx_end_signal_handling(
+/*====================*/
+ trx_t* trx); /* in: trx */
+/*************************************************************************
+Creates a commit command node struct. */
+
+commit_node_t*
+commit_node_create(
+/*===============*/
+ /* out, own: commit node struct */
+ mem_heap_t* heap); /* in: mem heap where created */
+/***************************************************************
+Performs an execution step for a commit type node in a query graph. */
+
+que_thr_t*
+trx_commit_step(
+/*============*/
+ /* out: query thread to run next, or NULL */
+ que_thr_t* thr); /* in: query thread */
+
+
+/* Signal to a transaction */
+struct trx_sig_struct{
+ ulint type; /* signal type */
+ ulint state; /* TRX_SIG_WAITING or
+ TRX_SIG_BEING_HANDLED */
+ ulint sender; /* TRX_SIG_SELF or
+ TRX_SIG_OTHER_SESS */
+ ibool reply; /* TRUE if the sender of the signal
+ wants reply after the operation induced
+ by the signal is completed; if this
+ field is TRUE and the receiver field
+ below is NULL, then a SUCCESS message
+ is sent to the client of the session
+ to which this trx belongs */
+ que_thr_t* receiver; /* query thread which wants the reply,
+ or NULL */
+ trx_savept_t savept; /* possible rollback savepoint */
+ UT_LIST_NODE_T(trx_sig_t)
+ signals; /* queue of pending signals to the
+ transaction */
+ UT_LIST_NODE_T(trx_sig_t)
+ reply_signals; /* list of signals for which the sender
+				transaction is waiting for a reply */
+};
+
+/* The transaction handle; every session has a trx object which is freed only
+when the session is freed; in addition there may be session-less transactions
+rolling back after a database recovery */
+
+struct trx_struct{
+ /* All the next fields are protected by the kernel mutex, except the
+ undo logs which are protected by undo_mutex */
+ ulint type; /* TRX_USER, TRX_PURGE */
+ ulint conc_state; /* state of the trx from the point
+ of view of concurrency control:
+ TRX_ACTIVE, TRX_COMMITTED_IN_MEMORY,
+ ... */
+ dulint id; /* transaction id */
+ dulint no; /* transaction serialization number ==
+ max trx id when the transaction is
+ moved to COMMITTED_IN_MEMORY state */
+ ibool dict_operation; /* TRUE if the trx is used to create
+ a table, create an index, or drop a
+ table */
+ dulint table_id; /* table id if the preceding field is
+ TRUE */
+ os_thread_id_t mysql_thread_id;/* id of the MySQL thread associated
+ with this transaction object */
+ ulint n_mysql_tables_in_use; /* number of Innobase tables
+ used in the processing of the current
+ SQL statement in MySQL */
+ UT_LIST_NODE_T(trx_t)
+ trx_list; /* list of transactions */
+ /*------------------------------*/
+ mutex_t undo_mutex; /* mutex protecting the fields in this
+ section (down to undo_no_arr), EXCEPT
+ last_sql_stat_start, which can be
+ accessed only when we know that there
+ cannot be any activity in the undo
+ logs! */
+ dulint undo_no; /* next undo log record number to
+ assign */
+ trx_savept_t last_sql_stat_start;
+ /* undo_no when the last sql statement
+ was started: in case of an error, trx
+ is rolled back down to this undo
+ number; see note at undo_mutex! */
+ trx_rseg_t* rseg; /* rollback segment assigned to the
+ transaction, or NULL if not assigned
+ yet */
+ trx_undo_t* insert_undo; /* pointer to the insert undo log, or
+ NULL if no inserts performed yet */
+ trx_undo_t* update_undo; /* pointer to the update undo log, or
+ NULL if no update performed yet */
+ dulint roll_limit; /* least undo number to undo during
+ a rollback */
+ ulint pages_undone; /* number of undo log pages undone
+ since the last undo log truncation */
+ trx_undo_arr_t* undo_no_arr; /* array of undo numbers of undo log
+ records which are currently processed
+ by a rollback operation */
+ /*------------------------------*/
+ ulint error_state; /* 0 if no error, otherwise error
+ number */
+ sess_t* sess; /* session of the trx, NULL if none */
+ ulint que_state; /* TRX_QUE_RUNNING, TRX_QUE_LOCK_WAIT,
+ ... */
+ que_t* graph; /* query currently run in the session,
+ or NULL if none; NOTE that the query
+ belongs to the session, and it can
+ survive over a transaction commit, if
+ it is a stored procedure with a COMMIT
+ WORK statement, for instance */
+ ulint n_active_thrs; /* number of active query threads */
+ ibool handling_signals;/* this is TRUE as long as the trx
+ is handling signals */
+ que_t* graph_before_signal_handling;
+ /* value of graph when signal handling
+ for this trx started: this is used to
+ return control to the original query
+ graph for error processing */
+ trx_sig_t sig; /* one signal object can be allocated
+ in this space, avoiding mem_alloc */
+ UT_LIST_BASE_NODE_T(trx_sig_t)
+ signals; /* queue of processed or pending
+ signals to the trx */
+ UT_LIST_BASE_NODE_T(trx_sig_t)
+ reply_signals; /* list of signals sent by the query
+ threads of this trx for which a thread
+ is waiting for a reply; if this trx is
+ killed, the reply requests in the list
+ must be canceled */
+ /*------------------------------*/
+ lock_t* wait_lock; /* if trx execution state is
+ TRX_QUE_LOCK_WAIT, this points to
+ the lock request, otherwise this is
+ NULL */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ wait_thrs; /* query threads belonging to this
+ trx that are in the QUE_THR_LOCK_WAIT
+ state */
+ /*------------------------------*/
+ mem_heap_t* lock_heap; /* memory heap for the locks of the
+ transaction; protected by
+ lock_heap_mutex */
+ UT_LIST_BASE_NODE_T(lock_t)
+ trx_locks; /* locks reserved by the transaction;
+ protected by lock_heap_mutex */
+ /*------------------------------*/
+ mem_heap_t* read_view_heap; /* memory heap for the read view */
+ read_view_t* read_view; /* consistent read view or NULL */
+};
+
+#define TRX_MAX_N_THREADS 32 /* maximum number of concurrent
+ threads running a single operation of
+ a transaction, e.g., a parallel query */
+/* Transaction types */
+#define TRX_USER 1 /* normal user transaction */
+#define TRX_PURGE 2 /* purge transaction: this is not
+ inserted to the trx list of trx_sys
+ and no rollback segment is assigned to
+ this */
+/* Transaction concurrency states */
+#define TRX_NOT_STARTED 1
+#define TRX_ACTIVE 2
+#define TRX_COMMITTED_IN_MEMORY 3
+
+/* Transaction execution states when trx state is TRX_ACTIVE */
+#define TRX_QUE_RUNNING 1 /* transaction is running */
+#define TRX_QUE_LOCK_WAIT 2 /* transaction is waiting for a lock */
+#define TRX_QUE_ROLLING_BACK 3 /* transaction is rolling back */
+#define TRX_QUE_COMMITTING 4 /* transaction is committing */
+
+/* Types of a trx signal */
+#define TRX_SIG_NO_SIGNAL 100
+#define TRX_SIG_TOTAL_ROLLBACK 1
+#define TRX_SIG_ROLLBACK_TO_SAVEPT 2
+#define TRX_SIG_COMMIT 3
+#define TRX_SIG_ERROR_OCCURRED 4
+#define TRX_SIG_BREAK_EXECUTION 5
+
+/* Sender types of a signal */
+#define TRX_SIG_SELF 1 /* sent by the session itself, or
+ by an error occurring within this
+ session */
+#define TRX_SIG_OTHER_SESS 2 /* sent by another session (which
+ must hold rights to this) */
+/* Signal states */
+#define TRX_SIG_WAITING 1
+#define TRX_SIG_BEING_HANDLED 2
+
+/* Commit command node in a query graph */
+struct commit_node_struct{
+ que_common_t common; /* node type: QUE_NODE_COMMIT */
+ ulint state; /* node execution state */
+};
+
+/* Commit node states */
+#define COMMIT_NODE_SEND 1
+#define COMMIT_NODE_WAIT 2
+
+
+#ifndef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+
+#endif
diff --git a/innobase/include/trx0trx.ic b/innobase/include/trx0trx.ic
new file mode 100644
index 00000000000..9d453047600
--- /dev/null
+++ b/innobase/include/trx0trx.ic
@@ -0,0 +1,23 @@
+/******************************************************
+The transaction
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/*****************************************************************
+Starts the transaction if it is not yet started. */
+UNIV_INLINE
+void
+trx_start_if_not_started(
+/*=====================*/
+ trx_t* trx) /* in: transaction */
+{
+ ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY);
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+
+ trx_start(trx, ULINT_UNDEFINED);
+ }
+}
diff --git a/innobase/include/trx0types.h b/innobase/include/trx0types.h
new file mode 100644
index 00000000000..02da1605077
--- /dev/null
+++ b/innobase/include/trx0types.h
@@ -0,0 +1,43 @@
+/******************************************************
+Transaction system global type definitions
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0types_h
+#define trx0types_h
+
+#include "lock0types.h"
+#include "ut0byte.h"
+
+/* Memory objects */
+typedef struct trx_struct trx_t;
+typedef struct trx_sys_struct trx_sys_t;
+typedef struct trx_sig_struct trx_sig_t;
+typedef struct trx_rseg_struct trx_rseg_t;
+typedef struct trx_undo_struct trx_undo_t;
+typedef struct trx_undo_arr_struct trx_undo_arr_t;
+typedef struct trx_undo_inf_struct trx_undo_inf_t;
+typedef struct trx_purge_struct trx_purge_t;
+typedef struct roll_node_struct roll_node_t;
+typedef struct commit_node_struct commit_node_t;
+
+/* Transaction savepoint */
+typedef struct trx_savept_struct trx_savept_t;
+struct trx_savept_struct{
+ dulint least_undo_no; /* least undo number to undo */
+};
+
+/* File objects */
+typedef byte trx_sysf_t;
+typedef byte trx_rsegf_t;
+typedef byte trx_usegf_t;
+typedef byte trx_ulogf_t;
+typedef byte trx_upagef_t;
+
+/* Undo log record */
+typedef byte trx_undo_rec_t;
+
+#endif
diff --git a/innobase/include/trx0undo.h b/innobase/include/trx0undo.h
new file mode 100644
index 00000000000..82c21f756e6
--- /dev/null
+++ b/innobase/include/trx0undo.h
@@ -0,0 +1,473 @@
+/******************************************************
+Transaction undo log
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0undo_h
+#define trx0undo_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "page0types.h"
+
+/***************************************************************************
+Builds a roll pointer dulint. */
+UNIV_INLINE
+dulint
+trx_undo_build_roll_ptr(
+/*====================*/
+ /* out: roll pointer */
+ ibool is_insert, /* in: TRUE if insert undo log */
+ ulint rseg_id, /* in: rollback segment id */
+ ulint page_no, /* in: page number */
+ ulint offset); /* in: offset of the undo entry within page */
+/***************************************************************************
+Decodes a roll pointer dulint. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ dulint roll_ptr, /* in: roll pointer */
+ ibool* is_insert, /* out: TRUE if insert undo log */
+ ulint* rseg_id, /* out: rollback segment id */
+ ulint* page_no, /* out: page number */
+ ulint* offset); /* out: offset of the undo entry within page */
+/***************************************************************************
+Returns TRUE if the roll pointer is of the insert type. */
+UNIV_INLINE
+ibool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ /* out: TRUE if insert undo log */
+ dulint roll_ptr); /* in: roll pointer */
+/*********************************************************************
+Writes a roll ptr to an index page. In case that the size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_roll_ptr(
+/*===============*/
+ byte* ptr, /* in: pointer to memory where written */
+ dulint roll_ptr); /* in: roll ptr */
+/*********************************************************************
+Reads a roll ptr from an index page. In case that the roll ptr size
+changes in some future version, this function should be used instead of
+mach_read_... */
+UNIV_INLINE
+dulint
+trx_read_roll_ptr(
+/*==============*/
+ /* out: roll ptr */
+ byte* ptr); /* in: pointer to memory from where to read */
+/**********************************************************************
+Gets an undo log page and x-latches it. */
+UNIV_INLINE
+page_t*
+trx_undo_page_get(
+/*===============*/
+ /* out: pointer to page x-latched */
+ ulint space, /* in: space where placed */
+ ulint page_no, /* in: page number */
+ mtr_t* mtr); /* in: mtr */
+/**********************************************************************
+Gets an undo log page and s-latches it. */
+UNIV_INLINE
+page_t*
+trx_undo_page_get_s_latched(
+/*=========================*/
+ /* out: pointer to page s-latched */
+ ulint space, /* in: space where placed */
+ ulint page_no, /* in: page number */
+ mtr_t* mtr); /* in: mtr */
+/**********************************************************************
+Returns the previous undo record on the page in the specified log, or
+NULL if none exists. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(
+/*=======================*/
+ /* out: pointer to record, NULL if none */
+ trx_undo_rec_t* rec, /* in: undo log record */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset); /* in: undo log header offset on page */
+/**********************************************************************
+Returns the next undo log record on the page in the specified log, or
+NULL if none exists. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_next_rec(
+/*=======================*/
+ /* out: pointer to record, NULL if none */
+ trx_undo_rec_t* rec, /* in: undo log record */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset); /* in: undo log header offset on page */
+/**********************************************************************
+Returns the last undo record on the page in the specified undo log, or
+NULL if none exists. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(
+/*=======================*/
+ /* out: pointer to record, NULL if none */
+ page_t* undo_page,/* in: undo log page */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset); /* in: undo log header offset on page */
+/**********************************************************************
+Returns the first undo record on the page in the specified undo log, or
+NULL if none exists. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(
+/*========================*/
+ /* out: pointer to record, NULL if none */
+ page_t* undo_page,/* in: undo log page */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset); /* in: undo log header offset on page */
+/***************************************************************************
+Gets the previous record in an undo log. */
+
+trx_undo_rec_t*
+trx_undo_get_prev_rec(
+/*==================*/
+ /* out: undo log record, the page s-latched,
+ NULL if none */
+ trx_undo_rec_t* rec, /* in: undo record */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset, /* in: undo log header offset on page */
+ mtr_t* mtr); /* in: mtr */
+/***************************************************************************
+Gets the next record in an undo log. */
+
+trx_undo_rec_t*
+trx_undo_get_next_rec(
+/*==================*/
+ /* out: undo log record, the page s-latched,
+ NULL if none */
+ trx_undo_rec_t* rec, /* in: undo record */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset, /* in: undo log header offset on page */
+ mtr_t* mtr); /* in: mtr */
+/***************************************************************************
+Gets the first record in an undo log. */
+
+trx_undo_rec_t*
+trx_undo_get_first_rec(
+/*===================*/
+ /* out: undo log record, the page latched, NULL if
+ none */
+ ulint space, /* in: undo log header space */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset, /* in: undo log header offset on page */
+ ulint mode, /* in: latching mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr); /* in: mtr */
+/************************************************************************
+Tries to add a page to the undo log segment where the undo log is placed. */
+
+ulint
+trx_undo_add_page(
+/*==============*/
+ /* out: page number if success, else
+ FIL_NULL */
+ trx_t* trx, /* in: transaction */
+ trx_undo_t* undo, /* in: undo log memory object */
+ mtr_t* mtr); /* in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+/***************************************************************************
+Truncates an undo log from the end. This function is used during a rollback
+to free space from an undo log. */
+
+void
+trx_undo_truncate_end(
+/*==================*/
+ trx_t* trx, /* in: transaction whose undo log it is */
+ trx_undo_t* undo, /* in: undo log */
+ dulint limit); /* in: all undo records with undo number
+ >= this value should be truncated */
+/***************************************************************************
+Truncates an undo log from the start. This function is used during a purge
+operation. */
+
+void
+trx_undo_truncate_start(
+/*====================*/
+ trx_rseg_t* rseg, /* in: rollback segment */
+ ulint space, /* in: space id of the log */
+ ulint hdr_page_no, /* in: header page number */
+ ulint hdr_offset, /* in: header offset on the page */
+ dulint limit); /* in: all undo pages with undo numbers <
+ this value should be truncated; NOTE that
+ the function only frees whole pages; the
+ header page is not freed, but emptied, if
+ all the records there are < limit */
+/************************************************************************
+Initializes the undo log lists for a rollback segment memory copy.
+This function is only called when the database is started or a new
+rollback segment is created. */
+
+ulint
+trx_undo_lists_init(
+/*================*/
+ /* out: the combined size of undo log segments
+ in pages */
+ trx_rseg_t* rseg); /* in: rollback segment memory object */
+/**************************************************************************
+Assigns an undo log for a transaction. A new undo log is created or a cached
+undo log reused. */
+
+trx_undo_t*
+trx_undo_assign_undo(
+/*=================*/
+ /* out: the undo log, NULL if did not succeed: out of
+ space */
+ trx_t* trx, /* in: transaction */
+ ulint type); /* in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+/**********************************************************************
+Sets the state of the undo log segment at a transaction finish. */
+
+page_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ /* out: undo log segment header page,
+ x-latched */
+ trx_t* trx, /* in: transaction */
+ trx_undo_t* undo, /* in: undo log memory copy */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+Adds the update undo log header as the first in the history list, and
+frees the memory object, or puts it to the list of cached update undo log
+segments. */
+
+void
+trx_undo_update_cleanup(
+/*====================*/
+ trx_t* trx, /* in: trx owning the update undo log */
+ page_t* undo_page, /* in: update undo log header page,
+ x-latched */
+ mtr_t* mtr); /* in: mtr */
+/**************************************************************************
+Discards an undo log and puts the segment to the list of cached update undo
+log segments. This optimized function is called if there is no need to
+keep the update undo log because there exist no read views and the transaction
+made no delete markings, which would make purge necessary. We restrict this
+to undo logs of size 1 to make things simpler. */
+
+dulint
+trx_undo_update_cleanup_by_discard(
+/*===============================*/
+ /* out: log sequence number at which mtr is
+ committed */
+ trx_t* trx, /* in: trx owning the update undo log */
+ mtr_t* mtr); /* in: mtr */
+/**********************************************************************
+Frees or caches an insert undo log after a transaction commit or rollback.
+Knowledge of inserts is not needed after a commit or rollback, therefore
+the data can be discarded. */
+
+void
+trx_undo_insert_cleanup(
+/*====================*/
+ trx_t* trx); /* in: transaction handle */
+/***************************************************************
+Parses the redo log entry of an undo log page initialization. */
+
+byte*
+trx_undo_parse_page_init(
+/*======================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+/***************************************************************
+Parses the redo log entry of an undo log page header create or reuse. */
+
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+ /* out: end of log record or NULL */
+ ulint type, /* in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+/***************************************************************
+Parses the redo log entry of an undo log page header discard. */
+
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+ /* out: end of log record or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ page_t* page, /* in: page or NULL */
+ mtr_t* mtr); /* in: mtr or NULL */
+
+
+/* Types of an undo log segment */
+#define TRX_UNDO_INSERT 1 /* contains undo entries for inserts */
+#define TRX_UNDO_UPDATE 2 /* contains undo entries for updates
+ and delete markings: in short,
+					modifications (the name 'UPDATE' is a
+ historical relic) */
+/* States of an undo log segment */
+#define TRX_UNDO_ACTIVE 1 /* contains an undo log of an active
+ transaction */
+#define TRX_UNDO_CACHED 2 /* cached for quick reuse */
+#define TRX_UNDO_TO_FREE 3 /* insert undo segment can be freed */
+#define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be
+ reused: it can be freed in purge when
+ all undo data in it is removed */
+
+/* Transaction undo log memory object; this is protected by the undo_mutex
+in the corresponding transaction object */
+
+struct trx_undo_struct{
+ /*-----------------------------*/
+ ulint id; /* undo log slot number within the
+ rollback segment */
+ ulint type; /* TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ ulint state; /* state of the corresponding undo log
+ segment */
+ ibool del_marks; /* relevant only in an update undo log:
+ this is TRUE if the transaction may
+ have delete marked records, because of
+ a delete of a row or an update of an
+ indexed field; purge is then
+ necessary. */
+ dulint trx_id; /* id of the trx assigned to the undo
+ log */
+ ibool dict_operation; /* TRUE if a dict operation trx */
+ dulint table_id; /* if a dict operation, then the table
+ id */
+ trx_rseg_t* rseg; /* rseg where the undo log belongs */
+ /*-----------------------------*/
+ ulint space; /* space id where the undo log
+ placed */
+ ulint hdr_page_no; /* page number of the header page in
+ the undo log */
+ ulint hdr_offset; /* header offset of the undo log on the
+ page */
+ ulint last_page_no; /* page number of the last page in the
+ undo log; this may differ from
+ top_page_no during a rollback */
+ ulint size; /* current size in pages */
+ /*-----------------------------*/
+ ulint empty; /* TRUE if the stack of undo log
+ records is currently empty */
+ ulint top_page_no; /* page number where the latest undo
+ log record was catenated; during
+ rollback the page from which the latest
+ undo record was chosen */
+ ulint top_offset; /* offset of the latest undo record,
+ i.e., the topmost element in the undo
+ log if we think of it as a stack */
+ dulint top_undo_no; /* undo number of the latest record */
+ page_t* guess_page; /* guess for the buffer frame where
+ the top page might reside */
+ /*-----------------------------*/
+ UT_LIST_NODE_T(trx_undo_t) undo_list;
+ /* undo log objects in the rollback
+ segment are chained into lists */
+};
+
+/* The offset of the undo log page header on pages of the undo log */
+#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA
+/*-------------------------------------------------------------*/
+/* Transaction undo log page header offsets */
+#define TRX_UNDO_PAGE_TYPE 0 /* TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+#define TRX_UNDO_PAGE_START 2 /* Byte offset where the undo log
+ records for the LATEST transaction
+ start on this page (remember that
+ in an update undo log, the first page
+ can contain several undo logs) */
+#define TRX_UNDO_PAGE_FREE 4 /* On each page of the undo log this
+ field contains the byte offset of the
+ first free byte on the page */
+#define TRX_UNDO_PAGE_NODE 6 /* The file list node in the chain
+ of undo log pages */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE)
+
+/* An update undo segment with just one page can be reused if it has
+less than this number of bytes used */
+
+#define TRX_UNDO_PAGE_REUSE_LIMIT (3 * UNIV_PAGE_SIZE / 4)
+
+/* An update undo log segment may contain several undo logs on its first page
+if the undo logs took so little space that the segment could be cached and
+reused. All the undo log headers are then on the first page, and the last one
+owns the undo log records on subsequent pages if the segment is bigger than
+one page. If an undo log is stored in a segment, then on the first page it is
+allowed to have zero undo records, but if the segment extends to several
+pages, then all the rest of the pages must contain at least one undo log
+record. */
+
+/* The offset of the undo log segment header on the first page of the undo
+log segment */
+
+#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE)
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_STATE 0 /* TRX_UNDO_ACTIVE, ... */
+#define TRX_UNDO_LAST_LOG 2 /* Offset of the last undo log header
+ on the segment header page, 0 if
+ none */
+#define TRX_UNDO_FSEG_HEADER 4 /* Header for the file segment which
+ the undo log segment occupies */
+#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE)
+ /* Base node for the list of pages in
+ the undo log segment; defined only on
+ the undo log segment's first page */
+/*-------------------------------------------------------------*/
+/* Size of the undo log segment header */
+#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE)
+
+
+/* The undo log header. There can be several undo log headers on the first
+page of an update undo log segment. */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_TRX_ID 0 /* Transaction id */
+#define TRX_UNDO_TRX_NO 8 /* Transaction number of the
+ transaction; defined only if the log
+ is in a history list */
+#define TRX_UNDO_DEL_MARKS 16 /* Defined only in an update undo
+ log: TRUE if the transaction may have
+ done delete markings of records, and
+ thus purge is necessary */
+#define TRX_UNDO_LOG_START 18 /* Offset of the first undo log record
+ of this log on the header page; purge
+					may remove undo log records from the
+					log start, and therefore this is not
+					necessarily the same as the end offset
+					of this log header */
+#define TRX_UNDO_DICT_OPERATION 20 /* TRUE if the transaction is a table
+ create, index create, or drop
+ transaction: in recovery
+ the transaction cannot be rolled back
+ in the usual way: a 'rollback' rather
+ means dropping the created or dropped
+ table, if it still exists */
+#define TRX_UNDO_TABLE_ID 22 /* Id of the table if the preceding
+ field is TRUE */
+#define TRX_UNDO_NEXT_LOG 30 /* Offset of the next undo log header
+ on this page, 0 if none */
+#define TRX_UNDO_PREV_LOG 32 /* Offset of the previous undo log
+ header on this page, 0 if none */
+#define TRX_UNDO_HISTORY_NODE 34 /* If the log is put to the history
+ list, the file list node is here */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_LOG_HDR_SIZE (34 + FLST_NODE_SIZE)
+
+#ifndef UNIV_NONINL
+#include "trx0undo.ic"
+#endif
+
+#endif
diff --git a/innobase/include/trx0undo.ic b/innobase/include/trx0undo.ic
new file mode 100644
index 00000000000..bedbc02b00b
--- /dev/null
+++ b/innobase/include/trx0undo.ic
@@ -0,0 +1,319 @@
+/******************************************************
+Transaction undo log
+
+(c) 1996 Innobase Oy
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+
+/***************************************************************************
+Builds a roll pointer dulint. */
+UNIV_INLINE
+dulint
+trx_undo_build_roll_ptr(
+/*====================*/
+ /* out: roll pointer */
+ ibool is_insert, /* in: TRUE if insert undo log */
+ ulint rseg_id, /* in: rollback segment id */
+ ulint page_no, /* in: page number */
+ ulint offset) /* in: offset of the undo entry within page */
+{
+ ut_ad(DATA_ROLL_PTR_LEN == 7);
+ ut_ad(rseg_id < 128);
+
+ return(ut_dulint_create(is_insert * 128 * 256 * 256
+ + rseg_id * 256 * 256
+ + (page_no / 256) / 256,
+ (page_no % (256 * 256)) * 256 * 256
+ + offset));
+}
+
+/***************************************************************************
+Decodes a roll pointer dulint. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ dulint roll_ptr, /* in: roll pointer */
+ ibool* is_insert, /* out: TRUE if insert undo log */
+ ulint* rseg_id, /* out: rollback segment id */
+ ulint* page_no, /* out: page number */
+ ulint* offset) /* out: offset of the undo entry within page */
+{
+ ulint low;
+ ulint high;
+
+ ut_ad(DATA_ROLL_PTR_LEN == 7);
+ ut_ad(TRUE == 1);
+
+ high = ut_dulint_get_high(roll_ptr);
+ low = ut_dulint_get_low(roll_ptr);
+
+ *offset = low % (256 * 256);
+
+ *is_insert = high / (256 * 256 * 128); /* TRUE == 1 */
+ *rseg_id = (high / (256 * 256)) % 128;
+
+ *page_no = (high % (256 * 256)) * 256 * 256
+ + (low / 256) / 256;
+}
+
+/***************************************************************************
+Returns TRUE if the roll pointer is of the insert type. */
+UNIV_INLINE
+ibool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ /* out: TRUE if insert undo log */
+ dulint roll_ptr) /* in: roll pointer */
+{
+ ulint high;
+
+ ut_ad(DATA_ROLL_PTR_LEN == 7);
+ ut_ad(TRUE == 1);
+
+ high = ut_dulint_get_high(roll_ptr);
+
+ return(high / (256 * 256 * 128));
+}
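+
+/***************************************************************************
+Illustrative sketch only (not referenced elsewhere): building a roll pointer
+and decoding it again recovers the original fields. The numeric values are
+arbitrary examples chosen to fit the field widths (rseg_id < 128,
+offset < 256 * 256). */
+UNIV_INLINE
+void
+trx_undo_roll_ptr_example(void)
+/*==========================*/
+{
+	dulint	roll_ptr;
+	ibool	is_insert;
+	ulint	rseg_id;
+	ulint	page_no;
+	ulint	offset;
+
+	roll_ptr = trx_undo_build_roll_ptr(TRUE, 3, 70000, 120);
+
+	ut_ad(trx_undo_roll_ptr_is_insert(roll_ptr));
+
+	trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id,
+						&page_no, &offset);
+	ut_ad(is_insert == TRUE);
+	ut_ad(rseg_id == 3);
+	ut_ad(page_no == 70000);
+	ut_ad(offset == 120);
+}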
+
+/*********************************************************************
+Writes a roll ptr to an index page. In case that the size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_roll_ptr(
+/*===============*/
+ byte* ptr, /* in: pointer to memory where written */
+ dulint roll_ptr) /* in: roll ptr */
+{
+ ut_ad(DATA_ROLL_PTR_LEN == 7);
+
+ mach_write_to_7(ptr, roll_ptr);
+}
+
+/*********************************************************************
+Reads a roll ptr from an index page. In case that the roll ptr size
+changes in some future version, this function should be used instead of
+mach_read_... */
+UNIV_INLINE
+dulint
+trx_read_roll_ptr(
+/*==============*/
+ /* out: roll ptr */
+ byte* ptr) /* in: pointer to memory from where to read */
+{
+ ut_ad(DATA_ROLL_PTR_LEN == 7);
+
+ return(mach_read_from_7(ptr));
+}
+
+/**********************************************************************
+Gets an undo log page and x-latches it. */
+UNIV_INLINE
+page_t*
+trx_undo_page_get(
+/*===============*/
+ /* out: pointer to page x-latched */
+ ulint space, /* in: space where placed */
+ ulint page_no, /* in: page number */
+ mtr_t* mtr) /* in: mtr */
+{
+ page_t* page;
+
+ page = buf_page_get(space, page_no, RW_X_LATCH, mtr);
+
+ buf_page_dbg_add_level(page, SYNC_TRX_UNDO_PAGE);
+
+ return(page);
+}
+
+/**********************************************************************
+Gets an undo log page and s-latches it. */
+UNIV_INLINE
+page_t*
+trx_undo_page_get_s_latched(
+/*=========================*/
+ /* out: pointer to page s-latched */
+ ulint space, /* in: space where placed */
+ ulint page_no, /* in: page number */
+ mtr_t* mtr) /* in: mtr */
+{
+ page_t* page;
+
+ page = buf_page_get(space, page_no, RW_S_LATCH, mtr);
+
+ buf_page_dbg_add_level(page, SYNC_TRX_UNDO_PAGE);
+
+ return(page);
+}
+
+/**********************************************************************
+Returns the start offset of the undo log records of the specified undo
+log on the page. */
+UNIV_INLINE
+ulint
+trx_undo_page_get_start(
+/*====================*/
+ /* out: start offset */
+ page_t* undo_page,/* in: undo log page */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset) /* in: undo log header offset on page */
+{
+ ulint start;
+
+ if (page_no == buf_frame_get_page_no(undo_page)) {
+
+ start = mach_read_from_2(offset + undo_page
+ + TRX_UNDO_LOG_START);
+ } else {
+ start = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+ }
+
+ return(start);
+}
+
+/**********************************************************************
+Returns the end offset of the undo log records of the specified undo
+log on the page. */
+UNIV_INLINE
+ulint
+trx_undo_page_get_end(
+/*==================*/
+ /* out: end offset */
+ page_t* undo_page,/* in: undo log page */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset) /* in: undo log header offset on page */
+{
+ trx_ulogf_t* log_hdr;
+ ulint end;
+
+ if (page_no == buf_frame_get_page_no(undo_page)) {
+
+ log_hdr = undo_page + offset;
+
+ end = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG);
+
+ if (end == 0) {
+ end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ }
+ } else {
+ end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ }
+
+ return(end);
+}
+
+/**********************************************************************
+Returns the previous undo record on the page in the specified log, or
+NULL if none exists. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(
+/*=======================*/
+ /* out: pointer to record, NULL if none */
+ trx_undo_rec_t* rec, /* in: undo log record */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset) /* in: undo log header offset on page */
+{
+ page_t* undo_page;
+ ulint start;
+
+ undo_page = buf_frame_align(rec);
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+
+ if (start + undo_page == rec) {
+
+ return(NULL);
+ }
+
+ return(undo_page + mach_read_from_2(rec - 2));
+}
+
+/**********************************************************************
+Returns the next undo log record on the page in the specified log, or
+NULL if none exists. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_next_rec(
+/*=======================*/
+ /* out: pointer to record, NULL if none */
+ trx_undo_rec_t* rec, /* in: undo log record */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset) /* in: undo log header offset on page */
+{
+ page_t* undo_page;
+ ulint end;
+ ulint next;
+
+ undo_page = buf_frame_align(rec);
+
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ next = mach_read_from_2(rec);
+
+ if (next == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + next);
+}
+
+/**********************************************************************
+Returns the last undo record on the page in the specified undo log, or
+NULL if none exists. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(
+/*=======================*/
+ /* out: pointer to record, NULL if none */
+ page_t* undo_page,/* in: undo log page */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset) /* in: undo log header offset on page */
+{
+ ulint start;
+ ulint end;
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ if (start == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + mach_read_from_2(undo_page + end - 2));
+}
+
+/**********************************************************************
+Returns the first undo record on the page in the specified undo log, or
+NULL if none exists. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(
+/*========================*/
+ /* out: pointer to record, NULL if none */
+ page_t* undo_page,/* in: undo log page */
+ ulint page_no,/* in: undo log header page number */
+ ulint offset) /* in: undo log header offset on page */
+{
+ ulint start;
+ ulint end;
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ if (start == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + start);
+}
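
Taken together, the getters above let a caller walk the record chain of one undo log on a page. A minimal traversal sketch (illustrative only, using just the inline functions defined above) looks like this:

    /* Visit every record of the undo log that starts at (page_no, offset)
       on an already latched undo page. */
    trx_undo_rec_t* rec;

    rec = trx_undo_page_get_first_rec(undo_page, page_no, offset);

    while (rec != NULL) {
            /* ... process the record ... */

            rec = trx_undo_page_get_next_rec(rec, page_no, offset);
    }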
diff --git a/innobase/include/univ.i b/innobase/include/univ.i
new file mode 100644
index 00000000000..d60c297f3c4
--- /dev/null
+++ b/innobase/include/univ.i
@@ -0,0 +1,166 @@
+/***************************************************************************
+Version control for database, common definitions, and include files
+
+(c) 1994 - 2000 Innobase Oy
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#ifndef univ_i
+#define univ_i
+
+#if (defined(_WIN32) || defined(_WIN64))
+#define __WIN__
+#include <windows.h>
+
+
+#else
+/* The Unix version */
+
+/* Include two header files from MySQL to make the Unix flavor used
+in compiling more Posix-compatible. We assume that 'innobase' is a
+subdirectory of 'mysql'. */
+#include <global.h>
+#include <my_pthread.h>
+
+#undef PACKAGE
+#undef VERSION
+
+/* Include the header file generated by GNU autoconf */
+#include "../ib_config.h"
+
+#ifdef HAVE_PREAD
+#define HAVE_PWRITE
+#endif
+
+#endif
+
+/* DEBUG VERSION CONTROL
+ ===================== */
+/* Make a non-inline debug version */
+/*
+#define UNIV_DEBUG
+#define UNIV_MEM_DEBUG
+#define UNIV_SYNC_DEBUG
+#define UNIV_SEARCH_DEBUG
+
+#define UNIV_IBUF_DEBUG
+
+#define UNIV_SYNC_PERF_STAT
+#define UNIV_SEARCH_PERF_STAT
+*/
+#define UNIV_LIGHT_MEM_DEBUG
+
+#define YYDEBUG 1
+
+/*
+#define UNIV_SQL_DEBUG
+#define UNIV_LOG_DEBUG
+*/
+ /* the above option prevents forcing of log to disk
+ at a buffer page write: it should be tested with this
+ option off; also some ibuf tests are suppressed */
+/*
+#define UNIV_BASIC_LOG_DEBUG
+*/
+ /* the above option enables basic recovery debugging:
+ new allocated file pages are reset */
+
+#if (!defined(UNIV_DEBUG) && !defined(INSIDE_HA_INNOBASE_CC))
+/* Definition for inline version */
+
+#ifdef __WIN__
+#define UNIV_INLINE __inline
+#else
+/* config.h contains the right def for 'inline' for the current compiler */
+#define UNIV_INLINE extern inline
+
+#endif
+
+#else
+/* If we want to compile a noninlined version we use the following macro
+definitions: */
+
+#define UNIV_NONINL
+#define UNIV_INLINE
+
+#endif /* UNIV_DEBUG */
+
+#ifdef _WIN32
+#define UNIV_WORD_SIZE 4
+#elif defined(_WIN64)
+#define UNIV_WORD_SIZE 8
+#else
+/* config.h generated by GNU autoconf will define SIZEOF_INT in Posix */
+#define UNIV_WORD_SIZE SIZEOF_INT
+#endif
+
+/* The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT 8
+
+/* The following alignment is used in aligning lints etc. */
+#define UNIV_WORD_ALIGNMENT UNIV_WORD_SIZE
+
+/*
+ DATABASE VERSION CONTROL
+ ========================
+*/
+
+/* The universal page size of the database */
+#define UNIV_PAGE_SIZE (2 * 8192) /* NOTE! Currently, this has to be a
+ power of 2 */
+/* The base-2 logarithm of UNIV_PAGE_SIZE: */
+#define UNIV_PAGE_SIZE_SHIFT 14
+
+/* Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM 32
+
+/*
+ UNIVERSAL TYPE DEFINITIONS
+ ==========================
+*/
+
+/* Note that inside MySQL 'byte' is defined as char on Linux! */
+#define byte unsigned char
+
+/* Another basic type we use is unsigned long integer which is intended to be
+equal to the word size of the machine. */
+
+typedef unsigned long int ulint;
+
+typedef long int lint;
+
+/* The following type should be at least a 64-bit floating point number */
+typedef double utfloat;
+
+/* The 'undefined' value for a ulint */
+#define ULINT_UNDEFINED ((ulint)(-1))
+
+/* The undefined 32-bit unsigned integer */
+#define ULINT32_UNDEFINED 0xFFFFFFFF
+
+/* Maximum value for a ulint */
+#define ULINT_MAX ((ulint)(-2))
+
+/* This 'ibool' type is used within Innobase. Remember that different included
+headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
+#define ibool ulint
+
+#ifndef TRUE
+
+#define TRUE 1
+#define FALSE 0
+
+#endif
+
+/* The following number as the length of a logical field means that the field
+has the SQL NULL as its value. */
+#define UNIV_SQL_NULL ULINT_UNDEFINED
+
+#include <stdio.h>
+#include "ut0dbg.h"
+#include "ut0ut.h"
+#include "db0err.h"
+
+#endif
diff --git a/innobase/include/univold.i b/innobase/include/univold.i
new file mode 100644
index 00000000000..8bcd28e180f
--- /dev/null
+++ b/innobase/include/univold.i
@@ -0,0 +1,164 @@
+/***************************************************************************
+Version control for database, common definitions, and include files
+
+(c) 1994 - 2000 Innobase Oy
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#ifndef univ_i
+#define univ_i
+
+#define UNIV_INTEL
+#define UNIV_PENTIUM
+/* If UNIV_WINNT is not defined, we assume Windows 95 */
+
+#define UNIV_WINNT
+#define UNIV_WINNT4
+#define __NT__
+
+#define UNIV_VISUALC
+
+#define __WIN__
+#define _WIN32_WINNT 0x0400
+
+/* DEBUG VERSION CONTROL
+ ===================== */
+/* Make a non-inline debug version */
+/*
+#define UNIV_DEBUG
+#define UNIV_MEM_DEBUG
+#define UNIV_SYNC_DEBUG
+#define UNIV_SEARCH_DEBUG
+
+#define UNIV_IBUF_DEBUG
+
+#define UNIV_SEARCH_PERF_STAT
+#define UNIV_SYNC_PERF_STAT
+*/
+#define UNIV_LIGHT_MEM_DEBUG
+
+#define YYDEBUG 1
+/*
+#define UNIV_SQL_DEBUG
+#define UNIV_LOG_DEBUG
+*/
+ /* the above option prevents forcing of log to disk
+ at a buffer page write: it should be tested with this
+ option off; also some ibuf tests are suppressed */
+/*
+#define UNIV_BASIC_LOG_DEBUG
+*/
+ /* the above option enables basic recovery debugging:
+ new allocated file pages are reset */
+
+/* The debug version is slower, thus we may change the length of test loops
+depending on the UNIV_DBC parameter */
+#ifdef UNIV_DEBUG
+#define UNIV_DBC 1
+#else
+#define UNIV_DBC 100
+#endif
+
+#ifndef UNIV_DEBUG
+/* Definition for inline version */
+
+#ifdef UNIV_VISUALC
+#define UNIV_INLINE __inline
+#elif defined(UNIV_GNUC)
+#define UNIV_INLINE extern __inline__
+#endif
+
+#else
+/* If we want to compile a noninlined version we use the following macro
+definitions: */
+
+#define UNIV_NONINL
+#define UNIV_INLINE
+
+#endif /* UNIV_DEBUG */
+/* If the compiler does not know inline specifier, we use: */
+/*
+#define UNIV_INLINE static
+*/
+
+
+/*
+ MACHINE VERSION CONTROL
+ =======================
+*/
+
+#ifdef UNIV_PENTIUM
+
+/* In a 32-bit computer word size is 4 */
+#define UNIV_WORD_SIZE 4
+
+/* The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT 8
+
+/* The following alignment is used in aligning lints etc. */
+#define UNIV_WORD_ALIGNMENT UNIV_WORD_SIZE
+
+#endif
+
+/*
+ DATABASE VERSION CONTROL
+ ========================
+*/
+
+/* The universal page size of the database */
+#define UNIV_PAGE_SIZE (2 * 8192)/* NOTE! Currently, this has to be a
+ power of 2 and divisible by
+ UNIV_MEM_ALIGNMENT */
+
+/* Do non-buffered io in buffer pool read/write operations */
+#define UNIV_NON_BUFFERED_IO
+
+/* Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM 32
+
+/*
+ UNIVERSAL TYPE DEFINITIONS
+ ==========================
+*/
+
+
+typedef unsigned char byte;
+
+/* Another basic type we use is unsigned long integer which is intended to be
+equal to the word size of the machine. */
+
+typedef unsigned long int ulint;
+
+typedef long int lint;
+
+/* The following type should be at least a 64-bit floating point number */
+typedef double utfloat;
+
+/* The 'undefined' value for a ulint */
+#define ULINT_UNDEFINED ((ulint)(-1))
+
+/* The undefined 32-bit unsigned integer */
+#define ULINT32_UNDEFINED 0xFFFFFFFF
+
+/* Maximum value for a ulint */
+#define ULINT_MAX ((ulint)(-2))
+
+
+/* Definition of the boolean type */
+typedef ulint bool;
+
+#define TRUE 1
+#define FALSE 0
+
+/* The following number as the length of a logical field means that the field
+has the SQL NULL as its value. */
+#define UNIV_SQL_NULL ULINT_UNDEFINED
+
+#include <stdio.h>
+#include "ut0dbg.h"
+#include "ut0ut.h"
+#include "db0err.h"
+
+#endif
diff --git a/innobase/include/univoldmysql.i b/innobase/include/univoldmysql.i
new file mode 100644
index 00000000000..269b584d073
--- /dev/null
+++ b/innobase/include/univoldmysql.i
@@ -0,0 +1,181 @@
+/***************************************************************************
+Version control for database, common definitions, and include files
+
+(c) 1994 - 1996 Innobase Oy
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#ifndef univ_i
+#define univ_i
+
+#define UNIV_INTEL
+#define UNIV_PENTIUM
+/* If UNIV_WINNT is not defined, we assume Windows 95 */
+
+#define UNIV_WINNT
+#define UNIV_WINNT4
+
+#define UNIV_VISUALC
+
+/* DEBUG VERSION CONTROL
+ ===================== */
+/* Make a profiler version where mutex_fence does not use CPUID and therefore
+is not totally safe. The sync-library must be recompiled before profiling. */
+/*
+#define UNIV_PROFILE
+*/
+/* When the following flag is defined, also mutex lock word reset to 0
+in mutex_exit is performed using a serializing instruction, which does not
+allow speculative reads be performed before memory writes */
+/*
+#define SYNC_SERIALIZE_MUTEX_RESET
+*/
+/* Make a non-inline debug version */
+
+#define UNIV_DEBUG
+#define UNIV_MEM_DEBUG
+#define UNIV_SYNC_DEBUG
+#define UNIV_SEARCH_DEBUG
+
+#define UNIV_IBUF_DEBUG
+
+#define UNIV_SEARCH_PERF_STAT
+#define UNIV_SYNC_PERF_STAT
+
+
+#define UNIV_LIGHT_MEM_DEBUG
+
+#define YYDEBUG 1
+/*
+#define UNIV_SQL_DEBUG
+#define UNIV_LOG_DEBUG
+*/
+ /* the above option prevents forcing of log to disk
+ at a buffer page write: it should be tested with this
+ option off; also some ibuf tests are suppressed */
+/*
+#define UNIV_BASIC_LOG_DEBUG
+*/
+ /* the above option enables basic recovery debugging:
+ new allocated file pages are reset */
+
+/* The debug version is slower, thus we may change the length of test loops
+depending on the UNIV_DBC parameter */
+#ifdef UNIV_DEBUG
+#define UNIV_DBC 1
+#else
+#define UNIV_DBC 100
+#endif
+
+#ifndef UNIV_DEBUG
+/* Definition for inline version */
+
+#ifdef UNIV_VISUALC
+#define UNIV_INLINE __inline
+#elif defined(UNIV_GNUC)
+#define UNIV_INLINE extern __inline__
+#endif
+
+#else
+/* If we want to compile a noninlined version we use the following macro
+definitions: */
+
+#define UNIV_NONINL
+#define UNIV_INLINE
+
+#endif /* UNIV_DEBUG */
+/* If the compiler does not know inline specifier, we use: */
+/*
+#define UNIV_INLINE static
+*/
+
+
+/*
+ MACHINE VERSION CONTROL
+ =======================
+*/
+
+#ifdef UNIV_PENTIUM
+
+/* In a 32-bit computer word size is 4 */
+#define UNIV_WORD_SIZE 4
+
+/* The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT 8
+
+/* The following alignment is used in aligning lints etc. */
+#define UNIV_WORD_ALIGNMENT UNIV_WORD_SIZE
+
+#endif
+
+/*
+ DATABASE VERSION CONTROL
+ ========================
+*/
+
+/* The universal page size of the database */
+#define UNIV_PAGE_SIZE 8192 /* NOTE! Currently, this has to be a
+ power of 2 and divisible by
+ UNIV_MEM_ALIGNMENT */
+/* The base-2 logarithm of UNIV_PAGE_SIZE */
+#define UNIV_PAGE_SIZE_SHIFT 13
+
+/* Do asynchronous io in buffer pool read/write operations */
+#ifdef UNIV_WINNT
+#define UNIV_ASYNC_IO
+#endif
+
+/* Do non-buffered io in buffer pool read/write operations */
+#define UNIV_NON_BUFFERED_IO
+
+/* Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM 32
+
+/*
+ UNIVERSAL TYPE DEFINITIONS
+ ==========================
+*/
+
+/*
+typedef unsigned char byte;
+*/
+
+/* Another basic type we use is unsigned long integer which is intended to be
+equal to the word size of the machine. */
+
+typedef unsigned long int ulint;
+
+typedef long int lint;
+
+/* The following type should be at least a 64-bit floating point number */
+typedef double utfloat;
+
+/* The 'undefined' value for a ulint */
+#define ULINT_UNDEFINED ((ulint)(-1))
+
+/* The undefined 32-bit unsigned integer */
+#define ULINT32_UNDEFINED 0xFFFFFFFF
+
+/* Maximum value for a ulint */
+#define ULINT_MAX ((ulint)(-2))
+
+/* Definition of the boolean type */
+#ifndef bool
+typedef ulint bool;
+#endif
+
+#define TRUE 1
+#define FALSE 0
+
+/* The following number as the length of a logical field means that the field
+has the SQL NULL as its value. */
+#define UNIV_SQL_NULL ULINT_UNDEFINED
+
+#include <stdio.h>
+#include "ut0dbg.h"
+#include "ut0ut.h"
+#include "db0err.h"
+
+#endif
diff --git a/innobase/include/usr0sess.h b/innobase/include/usr0sess.h
new file mode 100644
index 00000000000..365f828ecfc
--- /dev/null
+++ b/innobase/include/usr0sess.h
@@ -0,0 +1,318 @@
+/******************************************************
+Sessions
+
+(c) 1996 Innobase Oy
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0sess_h
+#define usr0sess_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "data0data.h"
+#include "rem0rec.h"
+#include "com0com.h"
+
+/* The session system global data structure */
+extern sess_sys_t* sess_sys;
+
+/*************************************************************************
+Sets the session id in a client message. */
+
+void
+sess_cli_msg_set_sess(
+/*==================*/
+ byte* str, /* in/out: message string */
+ dulint sess_id);/* in: session id */
+/***************************************************************************
+Sets the message type of a message from the client. */
+UNIV_INLINE
+void
+sess_cli_msg_set_type(
+/*==================*/
+ byte* str, /* in: message string */
+ ulint type); /* in: message type */
+/***************************************************************************
+Gets the message type of a message from the server. */
+UNIV_INLINE
+ulint
+sess_srv_msg_get_type(
+/*==================*/
+ /* out: message type */
+ byte* str); /* in: message string */
+/***************************************************************************
+Creates a session system at database start. */
+
+void
+sess_sys_init_at_db_start(void);
+/*===========================*/
+/*************************************************************************
+Opens a session. */
+
+sess_t*
+sess_open(
+/*======*/
+ /* out, own: session object */
+ com_endpoint_t* endpoint, /* in: communication endpoint used
+ for communicating with the client */
+ byte* addr_buf, /* in: client address */
+ ulint addr_len); /* in: client address length */
+/*************************************************************************
+Closes a session, freeing the memory occupied by it. */
+
+void
+sess_close(
+/*=======*/
+ sess_t* sess); /* in, own: session object */
+/*************************************************************************
+Raises an SQL error. */
+
+void
+sess_raise_error_low(
+/*=================*/
+ trx_t* trx, /* in: transaction */
+ ulint err_no, /* in: error number */
+ ulint type, /* in: more info of the error, or 0 */
+ dict_table_t* table, /* in: dictionary table or NULL */
+ dict_index_t* index, /* in: table index or NULL */
+ dtuple_t* tuple, /* in: tuple to insert or NULL */
+ rec_t* rec, /* in: record or NULL */
+ char* err_str);/* in: arbitrary null-terminated error string,
+ or NULL */
+/*************************************************************************
+Closes a session, freeing the memory occupied by it, if it is in a state
+where it should be closed. */
+
+ibool
+sess_try_close(
+/*===========*/
+ /* out: TRUE if closed */
+ sess_t* sess); /* in, own: session object */
+/*************************************************************************
+Initializes the first fields of a message to client. */
+
+void
+sess_srv_msg_init(
+/*==============*/
+ sess_t* sess, /* in: session object */
+ byte* buf, /* in: message buffer, must be at least of size
+ SESS_SRV_MSG_DATA */
+ ulint type); /* in: message type */
+/*************************************************************************
+Sends a simple message to client. */
+
+void
+sess_srv_msg_send_simple(
+/*=====================*/
+ sess_t* sess, /* in: session object */
+ ulint type, /* in: message type */
+ ulint rel_kernel); /* in: SESS_RELEASE_KERNEL or
+ SESS_NOT_RELEASE_KERNEL */
+/***************************************************************************
+Processes a message from a client. NOTE: May release the kernel mutex
+temporarily. */
+
+void
+sess_receive_msg_rel_kernel(
+/*========================*/
+ sess_t* sess, /* in: session */
+ byte* str, /* in: message string */
+ ulint len); /* in: message length */
+/***************************************************************************
+When a command has been completed, this function sends the message about it
+to the client. */
+
+void
+sess_command_completed_message(
+/*===========================*/
+ sess_t* sess, /* in: session */
+ byte* msg, /* in: message buffer */
+ ulint len); /* in: message data length */
+/***********************************************************************
+Starts a new connection and a session, or starts a query based on a client
+message. This is called by a SRV_COM thread. */
+
+void
+sess_process_cli_msg(
+/*=================*/
+ byte* str, /* in: message string */
+ ulint len, /* in: string length */
+ byte* addr, /* in: address string */
+ ulint alen); /* in: address length */
+
+
+/* The session handle. All fields are protected by the kernel mutex */
+struct sess_struct{
+ dulint id; /* session id */
+ dulint usr_id; /* user id */
+ hash_node_t hash; /* hash chain node */
+ ulint refer_count; /* reference count to the session
+ object: when this drops to zero
+ and the session has no query graphs
+ left, discarding the session object
+ is allowed */
+ dulint error_count; /* if this counter has increased while
+ a thread is parsing an SQL command,
+ its graph should be discarded */
+ ibool disconnecting; /* TRUE if the session is to be
+ disconnected when its reference
+ count drops to 0 */
+ ulint state; /* state of the session */
+ dulint msgs_sent; /* count of messages sent to the
+ client */
+ dulint msgs_recv; /* count of messages received from the
+ client */
+ ibool client_waits; /* when the session receives a message
+ from the client, this is set to TRUE, and
+ when the session sends a message to
+ the client, this is set to FALSE */
+ trx_t* trx; /* transaction object permanently
+ assigned for the session: the
+ transaction instance designated by the
+ trx id changes, but the memory
+ structure is preserved */
+ ulint next_graph_id; /* next query graph id to assign */
+ UT_LIST_BASE_NODE_T(que_t)
+ graphs; /* query graphs belonging to this
+ session */
+ /*------------------------------*/
+ ulint err_no; /* latest error number, 0 if none */
+ char* err_str; /* latest error string */
+ ulint err_len; /* error string length */
+ /*------------------------------*/
+ com_endpoint_t* endpoint; /* server communications endpoint used
+ to communicate with the client */
+ char* addr_buf; /* client address string */
+ ulint addr_len; /* client address string length */
+ /*------------------------------*/
+ byte* big_msg; /* if the client sends a message which
+ does not fit in a single packet,
+ it is assembled in this buffer; if
+ this field is not NULL, it is assumed
+ that the message should be catenated
+ here */
+ ulint big_msg_size; /* size of the big message buffer */
+ ulint big_msg_len; /* length of data in the big message
+ buffer */
+};
+
+/* The session system; this is protected by the kernel mutex */
+struct sess_sys_struct{
+ ulint state; /* state of the system:
+ SESS_SYS_RUNNING or
+ SESS_SYS_SHUTTING_DOWN */
+ sess_t* shutdown_req; /* if shutdown was requested by some
+ session, confirmation of shutdown
+ completion should be sent to this
+ session */
+ dulint free_sess_id; /* first unused session id */
+ hash_table_t* hash; /* hash table of the sessions */
+};
+
+
+/*---------------------------------------------------*/
+/* The format of an incoming message from a client */
+#define SESS_CLI_MSG_CHECKSUM 0 /* the checksum should be the first
+ field in the message */
+#define SESS_CLI_MSG_SESS_ID 4 /* this is set to 0 if the client
+ wants to connect and establish
+ a new session */
+#define SESS_CLI_MSG_SESS_ID_CHECK 12 /* checksum of the sess id field */
+#define SESS_CLI_MSG_TYPE 16
+#define SESS_CLI_MSG_NO 20
+#define SESS_CLI_MSG_CONTINUE 28 /* 0, or SESS_MSG_FIRST_PART,
+ SESS_MSG_MIDDLE_PART, or
+ SESS_MSG_LAST_PART */
+#define SESS_CLI_MSG_CONT_SIZE 32 /* size of a multipart message in
+ kilobytes (rounded upwards) */
+#define SESS_CLI_MSG_DATA 36
+/*---------------------------------------------------*/
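
As a small illustration of the layout above (not part of the patch), the fixed header fields of a received client message could be read with the mach_ utilities; 'msg' is assumed to point to a complete received message buffer.

    /* Sketch: read two header fields of an incoming client message. */
    ulint   type = mach_read_from_4(msg + SESS_CLI_MSG_TYPE);
    ulint   cont = mach_read_from_4(msg + SESS_CLI_MSG_CONTINUE);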
+
+/* Client-to-session message types */
+#define SESS_CLI_CONNECT 1
+#define SESS_CLI_PREPARE 2
+#define SESS_CLI_EXECUTE 3
+#define SESS_CLI_BREAK_EXECUTION 4
+
+/* Client-to-session statement command types */
+#define SESS_COMM_FETCH_NEXT 1
+#define SESS_COMM_FETCH_PREV 2
+#define SESS_COMM_FETCH_FIRST 3
+#define SESS_COMM_FETCH_LAST 4
+#define SESS_COMM_FETCH_NTH 5
+#define SESS_COMM_FETCH_NTH_LAST 6
+#define SESS_COMM_EXECUTE 7
+#define SESS_COMM_NO_COMMAND 8
+
+/*---------------------------------------------------*/
+/* The format of an outgoing message from a session to the client */
+#define SESS_SRV_MSG_CHECKSUM 0 /* the checksum should be the first
+ field in the message */
+#define SESS_SRV_MSG_SESS_ID 4
+#define SESS_SRV_MSG_TYPE 12
+#define SESS_SRV_MSG_NO 16
+#define SESS_SRV_MSG_CONTINUE 24 /* 0, or SESS_MSG_FIRST_PART,
+ SESS_MSG_MIDDLE_PART, or
+ SESS_MSG_LAST_PART */
+#define SESS_SRV_MSG_CONT_SIZE 28 /* size of a multipart message
+ in kilobytes (rounded upward) */
+#define SESS_SRV_MSG_DATA 32
+/*---------------------------------------------------*/
+
+/* Session-to-client message types */
+#define SESS_SRV_ACCEPT_CONNECT 1
+#define SESS_SRV_SUCCESS 2
+#define SESS_SRV_ERROR 3
+
+/* Multipart messages */
+#define SESS_MSG_SINGLE_PART 0
+#define SESS_MSG_FIRST_PART 1
+#define SESS_MSG_MIDDLE_PART 2
+#define SESS_MSG_LAST_PART 3
+
+/* Error numbers */
+#define SESS_ERR_NONE 0
+#define SESS_ERR_TRX_COMMITTED 1
+#define SESS_ERR_TRX_ROLLED_BACK 2
+#define SESS_ERR_SESSION_DISCONNECTED 3
+#define SESS_ERR_REPLY_FAILED 4
+#define SESS_ERR_CANNOT_BREAK_OP 5
+#define SESS_ERR_MSG_LOST 6
+#define SESS_ERR_MSG_CORRUPTED 7
+#define SESS_ERR_EXTRANEOUS_MSG 8
+#define SESS_ERR_OUT_OF_MEMORY 9
+#define SESS_ERR_SQL_ERROR 10
+#define SESS_ERR_STMT_NOT_FOUND 11
+#define SESS_ERR_STMT_NOT_READY 12
+#define SESS_ERR_EXTRANEOUS_SRV_MSG 13
+#define SESS_ERR_BREAK_BY_CLIENT 14
+
+/* Session states */
+#define SESS_ACTIVE 1
+#define SESS_ERROR 2 /* session contains an error message
+ which has not yet been communicated
+ to the client */
+/* Session system states */
+#define SESS_SYS_RUNNING 1
+#define SESS_SYS_SHUTTING_DOWN 2
+
+/* Session hash table size */
+#define SESS_HASH_SIZE 1024
+
+/* Flags used in sess_srv_msg_send */
+#define SESS_RELEASE_KERNEL 1
+#define SESS_NOT_RELEASE_KERNEL 2
+
+#ifndef UNIV_NONINL
+#include "usr0sess.ic"
+#endif
+
+#endif
diff --git a/innobase/include/usr0sess.ic b/innobase/include/usr0sess.ic
new file mode 100644
index 00000000000..ee2592c7963
--- /dev/null
+++ b/innobase/include/usr0sess.ic
@@ -0,0 +1,31 @@
+/******************************************************
+Sessions
+
+(c) 1996 Innobase Oy
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+/***************************************************************************
+Sets the message type of a message from the client. */
+UNIV_INLINE
+void
+sess_cli_msg_set_type(
+/*==================*/
+ byte* str, /* in: message string */
+ ulint type) /* in: message type */
+{
+ mach_write_to_4(str + SESS_CLI_MSG_TYPE, type);
+}
+
+/***************************************************************************
+Gets the message type of a message from the server. */
+UNIV_INLINE
+ulint
+sess_srv_msg_get_type(
+/*==================*/
+ /* out: message type */
+ byte* str) /* in: message string */
+{
+ return(mach_read_from_4(str + SESS_SRV_MSG_TYPE));
+}
diff --git a/innobase/include/usr0types.h b/innobase/include/usr0types.h
new file mode 100644
index 00000000000..67070ccce27
--- /dev/null
+++ b/innobase/include/usr0types.h
@@ -0,0 +1,16 @@
+/******************************************************
+Users and sessions global types
+
+(c) 1996 Innobase Oy
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0types_h
+#define usr0types_h
+
+typedef struct sess_struct sess_t;
+typedef struct sess_sys_struct sess_sys_t;
+typedef struct sess_sig_struct sess_sig_t;
+
+#endif
diff --git a/innobase/include/ut0byte.h b/innobase/include/ut0byte.h
new file mode 100644
index 00000000000..77795ee0708
--- /dev/null
+++ b/innobase/include/ut0byte.h
@@ -0,0 +1,229 @@
+/**********************************************************************
+Utilities for byte operations
+
+(c) 1994, 1995 Innobase Oy
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0byte_h
+#define ut0byte_h
+
+
+#include "univ.i"
+
+/* Type definition for a 64-bit unsigned integer, which also works
+on 32-bit machines. NOTE! Access the fields only with the accessor
+functions. This definition appears here only for the compiler to
+know the size of a dulint. */
+
+typedef struct dulint_struct dulint;
+struct dulint_struct{
+ ulint high; /* most significant 32 bits */
+ ulint low; /* least significant 32 bits */
+};
+
+/* Zero value for a dulint */
+extern dulint ut_dulint_zero;
+
+/* Maximum value for a dulint */
+extern dulint ut_dulint_max;
+
+/***********************************************************
+Creates a 64-bit dulint out of two ulints. */
+UNIV_INLINE
+dulint
+ut_dulint_create(
+/*=============*/
+ /* out: created dulint */
+ ulint high, /* in: high-order 32 bits */
+ ulint low); /* in: low-order 32 bits */
+/***********************************************************
+Gets the high-order 32 bits of a dulint. */
+UNIV_INLINE
+ulint
+ut_dulint_get_high(
+/*===============*/
+ /* out: 32 bits in ulint */
+ dulint d); /* in: dulint */
+/***********************************************************
+Gets the low-order 32 bits of a dulint. */
+UNIV_INLINE
+ulint
+ut_dulint_get_low(
+/*==============*/
+ /* out: 32 bits in ulint */
+ dulint d); /* in: dulint */
+/***********************************************************
+Tests if a dulint is zero. */
+UNIV_INLINE
+ibool
+ut_dulint_is_zero(
+/*==============*/
+ /* out: TRUE if zero */
+ dulint a); /* in: dulint */
+/***********************************************************
+Compares two dulints. */
+UNIV_INLINE
+int
+ut_dulint_cmp(
+/*==========*/
+ /* out: -1 if a < b, 0 if a == b,
+ 1 if a > b */
+ dulint a, /* in: dulint */
+ dulint b); /* in: dulint */
+/***********************************************************
+Calculates the max of two dulints. */
+UNIV_INLINE
+dulint
+ut_dulint_get_max(
+/*==============*/
+ /* out: max(a, b) */
+ dulint a, /* in: dulint */
+ dulint b); /* in: dulint */
+/***********************************************************
+Calculates the min of two dulints. */
+UNIV_INLINE
+dulint
+ut_dulint_get_min(
+/*==============*/
+ /* out: min(a, b) */
+ dulint a, /* in: dulint */
+ dulint b); /* in: dulint */
+/***********************************************************
+Adds a ulint to a dulint. */
+UNIV_INLINE
+dulint
+ut_dulint_add(
+/*==========*/
+ /* out: sum a + b */
+ dulint a, /* in: dulint */
+ ulint b); /* in: ulint */
+/***********************************************************
+Subtracts a ulint from a dulint. */
+UNIV_INLINE
+dulint
+ut_dulint_subtract(
+/*===============*/
+ /* out: a - b */
+ dulint a, /* in: dulint */
+ ulint b); /* in: ulint, b <= a */
+/***********************************************************
+Subtracts a dulint from another. NOTE that the difference must be positive
+and smaller than 4G. */
+UNIV_INLINE
+ulint
+ut_dulint_minus(
+/*============*/
+ /* out: a - b */
+ dulint a, /* in: dulint; NOTE a must be >= b and at most
+ 2 to power 32 - 1 greater */
+ dulint b); /* in: dulint */
+/************************************************************
+Rounds a dulint downward to a multiple of a power of 2. */
+UNIV_INLINE
+dulint
+ut_dulint_align_down(
+/*=================*/
+ /* out: rounded value */
+ dulint n, /* in: number to be rounded */
+ ulint align_no); /* in: align by this number which must be a
+ power of 2 */
+/************************************************************
+Rounds a dulint upward to a multiple of a power of 2. */
+UNIV_INLINE
+dulint
+ut_dulint_align_up(
+/*===============*/
+ /* out: rounded value */
+ dulint n, /* in: number to be rounded */
+ ulint align_no); /* in: align by this number which must be a
+ power of 2 */
+/***********************************************************
+Increments a dulint variable by 1. */
+#define UT_DULINT_INC(D)\
+{\
+ if ((D).low == 0xFFFFFFFF) {\
+ (D).high = (D).high + 1;\
+ (D).low = 0;\
+ } else {\
+ (D).low = (D).low + 1;\
+ }\
+}
+/***********************************************************
+Tests if two dulints are equal. */
+#define UT_DULINT_EQ(D1, D2) (((D1).low == (D2).low)\
+ && ((D1).high == (D2).high))
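
A short usage sketch (illustrative only, using the accessors and macros declared above) shows a dulint crossing the 32-bit boundary:

    dulint  n;

    n = ut_dulint_create(0, 0xFFFFFFFF);   /* n == 2^32 - 1 */
    UT_DULINT_INC(n);                      /* carry: high becomes 1, low 0 */

    ut_a(ut_dulint_get_high(n) == 1);
    ut_a(ut_dulint_get_low(n) == 0);
    ut_a(ut_dulint_cmp(n, ut_dulint_create(1, 0)) == 0);
    ut_a(UT_DULINT_EQ(n, ut_dulint_create(1, 0)));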
+/****************************************************************
+Sort function for dulint arrays. */
+void
+ut_dulint_sort(dulint* arr, dulint* aux_arr, ulint low, ulint high);
+/*===============================================================*/
+/************************************************************
+The following function calculates the value of an integer n rounded
+to the smallest multiple of align_no which is >= n. align_no has to be a
+power of 2. */
+UNIV_INLINE
+ulint
+ut_calc_align(
+/*==========*/
+ /* out: rounded value */
+ ulint n, /* in: number to be rounded */
+ ulint align_no); /* in: align by this number */
+/************************************************************
+The following function calculates the value of an integer n rounded
+to the largest multiple of align_no which is <= n. align_no has to be a
+power of 2. */
+UNIV_INLINE
+ulint
+ut_calc_align_down(
+/*===============*/
+ /* out: rounded value */
+ ulint n, /* in: number to be rounded */
+ ulint align_no); /* in: align by this number */
+/*************************************************************
+The following function rounds up a pointer to the nearest aligned address. */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+ /* out: aligned pointer */
+ void* ptr, /* in: pointer */
+ ulint align_no); /* in: align by this number */
+/*************************************************************
+The following function rounds down a pointer to the nearest
+aligned address. */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+ /* out: aligned pointer */
+ void* ptr, /* in: pointer */
+ ulint align_no); /* in: align by this number */
+/*********************************************************************
+Gets the nth bit of a ulint. */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ /* out: TRUE if nth bit is 1; 0th bit is defined to
+ be the least significant */
+ ulint a, /* in: ulint */
+ ulint n); /* in: nth bit requested */
+/*********************************************************************
+Sets the nth bit of a ulint. */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+ /* out: the ulint with the bit set as requested */
+ ulint a, /* in: ulint */
+ ulint n, /* in: nth bit requested */
+ ibool val); /* in: value for the bit to set */
+
+
+#ifndef UNIV_NONINL
+#include "ut0byte.ic"
+#endif
+
+#endif
diff --git a/innobase/include/ut0byte.ic b/innobase/include/ut0byte.ic
new file mode 100644
index 00000000000..b8170392c8f
--- /dev/null
+++ b/innobase/include/ut0byte.ic
@@ -0,0 +1,360 @@
+/******************************************************************
+Utilities for byte operations
+
+(c) 1994, 1995 Innobase Oy
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/***********************************************************
+Creates a 64-bit dulint out of two ulints. */
+UNIV_INLINE
+dulint
+ut_dulint_create(
+/*=============*/
+ /* out: created dulint */
+ ulint high, /* in: high-order 32 bits */
+ ulint low) /* in: low-order 32 bits */
+{
+ dulint res;
+
+ ut_ad(high <= 0xFFFFFFFF);
+ ut_ad(low <= 0xFFFFFFFF);
+
+ res.high = high;
+ res.low = low;
+
+ return(res);
+}
+
+/***********************************************************
+Gets the high-order 32 bits of a dulint. */
+UNIV_INLINE
+ulint
+ut_dulint_get_high(
+/*===============*/
+ /* out: 32 bits in ulint */
+ dulint d) /* in: dulint */
+{
+ return(d.high);
+}
+
+/***********************************************************
+Gets the low-order 32 bits of a dulint. */
+UNIV_INLINE
+ulint
+ut_dulint_get_low(
+/*==============*/
+ /* out: 32 bits in ulint */
+ dulint d) /* in: dulint */
+{
+ return(d.low);
+}
+
+/***********************************************************
+Tests if a dulint is zero. */
+UNIV_INLINE
+ibool
+ut_dulint_is_zero(
+/*==============*/
+ /* out: TRUE if zero */
+ dulint a) /* in: dulint */
+{
+ if ((a.low == 0) && (a.high == 0)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************
+Compares two dulints. */
+UNIV_INLINE
+int
+ut_dulint_cmp(
+/*==========*/
+ /* out: -1 if a < b, 0 if a == b,
+ 1 if a > b */
+ dulint a, /* in: dulint */
+ dulint b) /* in: dulint */
+{
+ if (a.high > b.high) {
+ return(1);
+ } else if (a.high < b.high) {
+ return(-1);
+ } else if (a.low > b.low) {
+ return(1);
+ } else if (a.low < b.low) {
+ return(-1);
+ } else {
+ return(0);
+ }
+}
+
+/***********************************************************
+Calculates the max of two dulints. */
+UNIV_INLINE
+dulint
+ut_dulint_get_max(
+/*==============*/
+ /* out: max(a, b) */
+ dulint a, /* in: dulint */
+ dulint b) /* in: dulint */
+{
+ if (ut_dulint_cmp(a, b) > 0) {
+
+ return(a);
+ }
+
+ return(b);
+}
+
+/***********************************************************
+Calculates the min of two dulints. */
+UNIV_INLINE
+dulint
+ut_dulint_get_min(
+/*==============*/
+ /* out: min(a, b) */
+ dulint a, /* in: dulint */
+ dulint b) /* in: dulint */
+{
+ if (ut_dulint_cmp(a, b) > 0) {
+
+ return(b);
+ }
+
+ return(a);
+}
+
+/***********************************************************
+Adds a ulint to a dulint. */
+UNIV_INLINE
+dulint
+ut_dulint_add(
+/*==========*/
+ /* out: sum a + b */
+ dulint a, /* in: dulint */
+ ulint b) /* in: ulint */
+{
+ if (0xFFFFFFFF - b >= a.low) {
+ a.low += b;
+
+ return(a);
+ }
+
+ a.low = a.low - (0xFFFFFFFF - b) - 1;
+
+ a.high++;
+
+ return(a);
+}
+
+/***********************************************************
+Subtracts a ulint from a dulint. */
+UNIV_INLINE
+dulint
+ut_dulint_subtract(
+/*===============*/
+ /* out: a - b */
+ dulint a, /* in: dulint */
+ ulint b) /* in: ulint, b <= a */
+{
+ if (a.low >= b) {
+ a.low -= b;
+
+ return(a);
+ }
+
+ b -= a.low + 1;
+
+ a.low = 0xFFFFFFFF - b;
+
+ ut_ad(a.high > 0);
+
+ a.high--;
+
+ return(a);
+}
+
+/***********************************************************
+Subtracts a dulint from another. NOTE that the difference must be positive
+and smaller than 4G. */
+UNIV_INLINE
+ulint
+ut_dulint_minus(
+/*============*/
+ /* out: a - b */
+ dulint a, /* in: dulint; NOTE a must be >= b and at most
+ 2 to power 32 - 1 greater */
+ dulint b) /* in: dulint */
+{
+ ulint diff;
+
+ if (a.high == b.high) {
+ ut_ad(a.low >= b.low);
+
+ return(a.low - b.low);
+ }
+
+ ut_ad(a.high == b.high + 1);
+
+ diff = (ulint)(0xFFFFFFFF - b.low);
+ diff += 1 + a.low;
+
+ ut_ad(diff > a.low);
+
+ return(diff);
+}
+
+/************************************************************
+Rounds a dulint downward to a multiple of a power of 2. */
+UNIV_INLINE
+dulint
+ut_dulint_align_down(
+/*=================*/
+ /* out: rounded value */
+ dulint n, /* in: number to be rounded */
+ ulint align_no) /* in: align by this number which must be a
+ power of 2 */
+{
+ ulint low, high;
+
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+
+ low = ut_dulint_get_low(n);
+ high = ut_dulint_get_high(n);
+
+ low = low & ~(align_no - 1);
+
+ return(ut_dulint_create(high, low));
+}
+
+/************************************************************
+Rounds a dulint upward to a multiple of a power of 2. */
+UNIV_INLINE
+dulint
+ut_dulint_align_up(
+/*===============*/
+ /* out: rounded value */
+ dulint n, /* in: number to be rounded */
+ ulint align_no) /* in: align by this number which must be a
+ power of 2 */
+{
+ return(ut_dulint_align_down(ut_dulint_add(n, align_no - 1), align_no));
+}
+
+/************************************************************
+The following function calculates the value of an integer n rounded
+to the smallest multiple of align_no which is >= n. align_no
+has to be a power of 2. */
+UNIV_INLINE
+ulint
+ut_calc_align(
+/*==========*/
+ /* out: rounded value */
+ ulint n, /* in: number to be rounded */
+ ulint align_no) /* in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+
+ return((n + align_no - 1) & ~(align_no - 1));
+}
+
+/*************************************************************
+The following function rounds up a pointer to the nearest aligned address. */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+ /* out: aligned pointer */
+ void* ptr, /* in: pointer */
+ ulint align_no) /* in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return((void*)((((ulint)ptr) + align_no - 1) & ~(align_no - 1)));
+}
+
+/************************************************************
+The following function calculates the value of an integer n rounded
+to the largest multiple of align_no which is <= n. align_no has to be a
+power of 2. */
+UNIV_INLINE
+ulint
+ut_calc_align_down(
+/*===============*/
+ /* out: rounded value */
+ ulint n, /* in: number to be rounded */
+ ulint align_no) /* in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+
+ return(n & ~(align_no - 1));
+}
+
+/*************************************************************
+The following function rounds down a pointer to the nearest
+aligned address. */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+ /* out: aligned pointer */
+ void* ptr, /* in: pointer */
+ ulint align_no) /* in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return((void*)((((ulint)ptr)) & ~(align_no - 1)));
+}
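
The rounding helpers above all rely on align_no being a power of 2; a couple of concrete values (illustrative only) show the effect:

    ut_a(ut_calc_align(13, 8) == 16);       /* round 13 up to a multiple of 8 */
    ut_a(ut_calc_align_down(13, 8) == 8);   /* round 13 down */
    ut_a(ut_calc_align(16, 8) == 16);       /* exact multiples are unchanged */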
+
+/*********************************************************************
+Gets the nth bit of a ulint. */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ /* out: TRUE if nth bit is 1; 0th bit is defined to
+ be the least significant */
+ ulint a, /* in: ulint */
+ ulint n) /* in: nth bit requested */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+ ut_ad(TRUE == 1);
+
+ return(1 & (a >> n));
+}
+
+/*********************************************************************
+Sets the nth bit of a ulint. */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+ /* out: the ulint with the bit set as requested */
+ ulint a, /* in: ulint */
+ ulint n, /* in: nth bit requested */
+ ibool val) /* in: value for the bit to set */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+ ut_ad(TRUE == 1);
+
+ if (val) {
+ return((1 << n) | a);
+ } else {
+ return(~(1 << n) & a);
+ }
+}
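
A brief sketch (illustrative only) of the bit helpers above:

    ulint   word = 0;

    word = ut_bit_set_nth(word, 3, TRUE);   /* word is now 8 */
    ut_a(ut_bit_get_nth(word, 3));
    ut_a(!ut_bit_get_nth(word, 2));

    word = ut_bit_set_nth(word, 3, FALSE);  /* word is 0 again */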
diff --git a/innobase/include/ut0dbg.h b/innobase/include/ut0dbg.h
new file mode 100644
index 00000000000..cf49f4f993f
--- /dev/null
+++ b/innobase/include/ut0dbg.h
@@ -0,0 +1,78 @@
+/*********************************************************************
+Debug utilities for Innobase
+
+(c) 1994, 1995 Innobase Oy
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#ifndef ut0dbg_h
+#define ut0dbg_h
+
+#include <assert.h>
+#include <stdlib.h>
+#include "univ.i"
+#include "os0thread.h"
+
+extern ulint ut_dbg_zero; /* This is used to eliminate
+ compiler warnings */
+extern ibool ut_dbg_stop_threads;
+
+extern ulint* ut_dbg_null_ptr;
+
+
+#define ut_a(EXPR)\
+{\
+ ulint dbg_i;\
+\
+ if (!((ulint)(EXPR) + ut_dbg_zero)) {\
+ /* printf(\
+ "Assertion failure in thread %lu in file %s line %lu\n",\
+ os_thread_get_curr_id(), __FILE__, (ulint)__LINE__);\
+ printf(\
+ "we generate a memory trap on purpose to start the debugger\n");*/\
+ ut_dbg_stop_threads = TRUE;\
+ dbg_i = *(ut_dbg_null_ptr);\
+ if (dbg_i) {\
+ ut_dbg_null_ptr = NULL;\
+ }\
+ }\
+ if (ut_dbg_stop_threads) {\
+ printf("Thread %lu stopped in file %s line %lu\n",\
+ os_thread_get_curr_id(), __FILE__, (ulint)__LINE__);\
+ os_thread_sleep(1000000000);\
+ }\
+}
+
+#define ut_error {\
+ ulint dbg_i;\
+ printf(\
+ "Assertion failure in thread %lu in file %s line %lu\n",\
+ os_thread_get_curr_id(), __FILE__, (ulint)__LINE__);\
+ printf("Generates memory trap on purpose for stack debugging\n");\
+ ut_dbg_stop_threads = TRUE;\
+ dbg_i = *(ut_dbg_null_ptr);\
+ printf("%lu", dbg_i);\
+}
+
+
+
+#ifdef UNIV_DEBUG
+#define ut_ad(EXPR) ut_a(EXPR)
+#define ut_d(EXPR) {EXPR;}
+#else
+#define ut_ad(EXPR)
+#define ut_d(EXPR)
+#endif
+
+
+#define UT_NOT_USED(A) A = A
+
+
+
+
+
+
+
+#endif
+
diff --git a/innobase/include/ut0lst.h b/innobase/include/ut0lst.h
new file mode 100644
index 00000000000..d290c476963
--- /dev/null
+++ b/innobase/include/ut0lst.h
@@ -0,0 +1,215 @@
+/**********************************************************************
+List utilities
+
+(c) 1995 Innobase Oy
+
+Created 9/10/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0lst_h
+#define ut0lst_h
+
+#include "univ.i"
+
+/* This module implements the two-way linear list which should be used
+if a list is used in the database. Note that a single struct may belong
+to two or more lists, provided that the list are given different names.
+An example of the usage of the lists can be found in fil0fil.c. */
+
+/***********************************************************************
+This macro expands to the unnamed type definition of a struct which acts
+as the two-way list base node. The base node contains pointers
+to both ends of the list and a count of nodes in the list (excluding
+the base node from the count). TYPE should be the list node type name. */
+
+#define UT_LIST_BASE_NODE_T(TYPE)\
+struct {\
+ ulint count; /* count of nodes in list */\
+ TYPE * start; /* pointer to list start, NULL if empty */\
+ TYPE * end; /* pointer to list end, NULL if empty */\
+}\
+
+/***********************************************************************
+This macro expands to the unnamed type definition of a struct which
+should be embedded in the nodes of the list; the node type must be a struct.
+This struct contains the pointers to next and previous nodes in the list.
+The name of the field in the node struct should be the name given
+to the list. TYPE should be the list node type name. Example of usage:
+
+typedef struct LRU_node_struct LRU_node_t;
+struct LRU_node_struct {
+ UT_LIST_NODE_T(LRU_node_t) LRU_list;
+ ...
+}
+The example implements an LRU list of name LRU_list. Its nodes are of type
+LRU_node_t.
+*/
+
+#define UT_LIST_NODE_T(TYPE)\
+struct {\
+ TYPE * prev; /* pointer to the previous node,\
+ NULL if start of list */\
+ TYPE * next; /* pointer to next node, NULL if end of list */\
+}\
+
+/***********************************************************************
+Initializes the base node of a two-way list. */
+
+#define UT_LIST_INIT(BASE)\
+{\
+ (BASE).count = 0;\
+ (BASE).start = NULL;\
+ (BASE).end = NULL;\
+}\
+
+/***********************************************************************
+Adds the node as the first element in a two-way linked list.
+BASE has to be the base node (not a pointer to it). N has to be
+the pointer to the node to be added to the list. NAME is the list name. */
+
+#define UT_LIST_ADD_FIRST(NAME, BASE, N)\
+{\
+ ut_ad(N);\
+ ((BASE).count)++;\
+ ((N)->NAME).next = (BASE).start;\
+ ((N)->NAME).prev = NULL;\
+ if ((BASE).start != NULL) {\
+ (((BASE).start)->NAME).prev = (N);\
+ }\
+ (BASE).start = (N);\
+ if ((BASE).end == NULL) {\
+ (BASE).end = (N);\
+ }\
+}\
+
+/***********************************************************************
+Adds the node as the last element in a two-way linked list.
+BASE has to be the base node (not a pointer to it). N has to be
+the pointer to the node to be added to the list. NAME is the list name. */
+
+#define UT_LIST_ADD_LAST(NAME, BASE, N)\
+{\
+ ut_ad(N);\
+ ((BASE).count)++;\
+ ((N)->NAME).prev = (BASE).end;\
+ ((N)->NAME).next = NULL;\
+ if ((BASE).end != NULL) {\
+ (((BASE).end)->NAME).next = (N);\
+ }\
+ (BASE).end = (N);\
+ if ((BASE).start == NULL) {\
+ (BASE).start = (N);\
+ }\
+}\
+
+/***********************************************************************
+Inserts a NODE2 after NODE1 in a list.
+BASE has to be the base node (not a pointer to it). NAME is the list
+name, NODE1 and NODE2 are pointers to nodes. */
+
+#define UT_LIST_INSERT_AFTER(NAME, BASE, NODE1, NODE2)\
+{\
+ ut_ad(NODE1);\
+ ut_ad(NODE2);\
+ ((BASE).count)++;\
+ ((NODE2)->NAME).prev = (NODE1);\
+ ((NODE2)->NAME).next = ((NODE1)->NAME).next;\
+ if (((NODE1)->NAME).next != NULL) {\
+ ((((NODE1)->NAME).next)->NAME).prev = (NODE2);\
+ }\
+ ((NODE1)->NAME).next = (NODE2);\
+ if ((BASE).end == (NODE1)) {\
+ (BASE).end = (NODE2);\
+ }\
+}\
+
+/***********************************************************************
+Removes a node from a two-way linked list. BASE has to be the base node
+(not a pointer to it). N has to be the pointer to the node to be removed
+from the list. NAME is the list name. */
+
+#define UT_LIST_REMOVE(NAME, BASE, N)\
+{\
+ ut_ad(N);\
+ ut_a((BASE).count > 0);\
+ ((BASE).count)--;\
+ if (((N)->NAME).next != NULL) {\
+ ((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev;\
+ } else {\
+ (BASE).end = ((N)->NAME).prev;\
+ }\
+ if (((N)->NAME).prev != NULL) {\
+ ((((N)->NAME).prev)->NAME).next = ((N)->NAME).next;\
+ } else {\
+ (BASE).start = ((N)->NAME).next;\
+ }\
+}\
+
+/************************************************************************
+Gets the next node in a two-way list. NAME is the name of the list
+and N is pointer to a node. */
+
+#define UT_LIST_GET_NEXT(NAME, N)\
+ (((N)->NAME).next)
+
+/************************************************************************
+Gets the previous node in a two-way list. NAME is the name of the list
+and N is pointer to a node. */
+
+#define UT_LIST_GET_PREV(NAME, N)\
+ (((N)->NAME).prev)
+
+/************************************************************************
+Alternative macro to get the number of nodes in a two-way list, i.e.,
+its length. BASE is the base node (not a pointer to it). */
+
+#define UT_LIST_GET_LEN(BASE)\
+ (BASE).count
+
+/************************************************************************
+Gets the first node in a two-way list, or returns NULL,
+if the list is empty. BASE is the base node (not a pointer to it). */
+
+#define UT_LIST_GET_FIRST(BASE)\
+ (BASE).start
+
+/************************************************************************
+Gets the last node in a two-way list, or returns NULL,
+if the list is empty. BASE is the base node (not a pointer to it). */
+
+#define UT_LIST_GET_LAST(BASE)\
+ (BASE).end
+
+/************************************************************************
+Checks the consistency of a two-way list. NAME is the name of the list,
+TYPE is the node type, and BASE is the base node (not a pointer to it). */
+
+#define UT_LIST_VALIDATE(NAME, TYPE, BASE)\
+{\
+ ulint ut_list_i_313;\
+ TYPE * ut_list_node_313;\
+\
+ ut_list_node_313 = (BASE).start;\
+\
+ for (ut_list_i_313 = 0; ut_list_i_313 < (BASE).count;\
+ ut_list_i_313++) {\
+ ut_a(ut_list_node_313);\
+ ut_list_node_313 = (ut_list_node_313->NAME).next;\
+ }\
+\
+ ut_a(ut_list_node_313 == NULL);\
+\
+ ut_list_node_313 = (BASE).end;\
+\
+ for (ut_list_i_313 = 0; ut_list_i_313 < (BASE).count;\
+ ut_list_i_313++) {\
+ ut_a(ut_list_node_313);\
+ ut_list_node_313 = (ut_list_node_313->NAME).prev;\
+ }\
+\
+ ut_a(ut_list_node_313 == NULL);\
+}\
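
Continuing the LRU_node_t example given in the comment above, a usage sketch (illustrative only; node1 and node2 are assumed to point to allocated LRU_node_t instances) could read:

    UT_LIST_BASE_NODE_T(LRU_node_t) base;
    LRU_node_t*                     node;

    UT_LIST_INIT(base);
    UT_LIST_ADD_LAST(LRU_list, base, node1);
    UT_LIST_ADD_LAST(LRU_list, base, node2);

    for (node = UT_LIST_GET_FIRST(base); node != NULL;
         node = UT_LIST_GET_NEXT(LRU_list, node)) {
            /* ... visit node ... */
    }

    ut_a(UT_LIST_GET_LEN(base) == 2);
    UT_LIST_VALIDATE(LRU_list, LRU_node_t, base);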
+
+
+#endif
+
diff --git a/innobase/include/ut0mem.h b/innobase/include/ut0mem.h
new file mode 100644
index 00000000000..4d266f34c17
--- /dev/null
+++ b/innobase/include/ut0mem.h
@@ -0,0 +1,64 @@
+/***********************************************************************
+Memory primitives
+
+(c) 1994, 1995 Innobase Oy
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include <string.h>
+#include <stdlib.h>
+#include "univ.i"
+
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, void* sour, ulint n);
+
+UNIV_INLINE
+void*
+ut_memmove(void* dest, void* sour, ulint n);
+
+UNIV_INLINE
+int
+ut_memcmp(void* str1, void* str2, ulint n);
+
+
+void*
+ut_malloc(ulint n);
+
+UNIV_INLINE
+void
+ut_free(void* ptr);
+
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, char* sour);
+
+UNIV_INLINE
+ulint
+ut_strlen(char* str);
+
+UNIV_INLINE
+int
+ut_strcmp(void* str1, void* str2);
+
+/**************************************************************************
+Catenates two strings into newly allocated memory. The memory must be freed
+using mem_free. */
+
+char*
+ut_str_catenate(
+/*============*/
+ /* out, own: catenated null-terminated string */
+ char* str1, /* in: null-terminated string */
+ char* str2); /* in: null-terminated string */
+
+#ifndef UNIV_NONINL
+#include "ut0mem.ic"
+#endif
+
+#endif
+
diff --git a/innobase/include/ut0mem.ic b/innobase/include/ut0mem.ic
new file mode 100644
index 00000000000..fc4b6bd8be5
--- /dev/null
+++ b/innobase/include/ut0mem.ic
@@ -0,0 +1,57 @@
+/***********************************************************************
+Memory primitives
+
+(c) 1994, 1995 Innobase Oy
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, void* sour, ulint n)
+{
+ return(memcpy(dest, sour, n));
+}
+
+UNIV_INLINE
+void*
+ut_memmove(void* dest, void* sour, ulint n)
+{
+ return(memmove(dest, sour, n));
+}
+
+UNIV_INLINE
+int
+ut_memcmp(void* str1, void* str2, ulint n)
+{
+ return(memcmp(str1, str2, n));
+}
+
+UNIV_INLINE
+void
+ut_free(void* ptr)
+{
+ free(ptr);
+}
+
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, char* sour)
+{
+ return(strcpy(dest, sour));
+}
+
+UNIV_INLINE
+ulint
+ut_strlen(char* str)
+{
+ return(strlen(str));
+}
+
+UNIV_INLINE
+int
+ut_strcmp(void* str1, void* str2)
+{
+ return(strcmp((char*)str1, (char*)str2));
+}
+
diff --git a/innobase/include/ut0rnd.h b/innobase/include/ut0rnd.h
new file mode 100644
index 00000000000..a30251e6da0
--- /dev/null
+++ b/innobase/include/ut0rnd.h
@@ -0,0 +1,121 @@
+/**********************************************************************
+Random numbers and hashing
+
+(c) 1994, 1995 Innobase Oy
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0rnd_h
+#define ut0rnd_h
+
+#include "univ.i"
+
+#include "ut0byte.h"
+
+/* The 'character code' for end of field or string (used
+in folding records) */
+#define UT_END_OF_FIELD 257
+
+/************************************************************
+This is used to set the random number seed. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+ ulint seed); /* in: seed */
+/************************************************************
+The following function generates a series of 'random' ulint integers. */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+ /* out: the next 'random' number */
+ ulint rnd); /* in: the previous random number value */
+/*************************************************************
+The following function generates 'random' ulint integers which
+enumerate the value space (let there be N of them) of ulint integers
+in a pseudo-random fashion. Note that the same integer repeats only
+after N calls to the generator. */
+UNIV_INLINE
+ulint
+ut_rnd_gen_ulint(void);
+/*==================*/
+ /* out: the 'random' number */
+/************************************************************
+Generates a random integer from a given interval. */
+UNIV_INLINE
+ulint
+ut_rnd_interval(
+/*============*/
+ /* out: the 'random' number */
+ ulint low, /* in: low limit; can generate also this value */
+ ulint high); /* in: high limit; can generate also this value */
+/*************************************************************
+Generates a random ibool value. */
+UNIV_INLINE
+ibool
+ut_rnd_gen_ibool(void);
+/*=================*/
+ /* out: the random value */
+/***********************************************************
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime or some
+random number to work reliably. */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*=========*/
+ /* out: hash value */
+ ulint key, /* in: value to be hashed */
+ ulint table_size); /* in: hash table size */
+/*****************************************************************
+Folds a pair of ulints. */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ /* out: folded value */
+ ulint n1, /* in: ulint */
+ ulint n2); /* in: ulint */
+/*****************************************************************
+Folds a dulint. */
+UNIV_INLINE
+ulint
+ut_fold_dulint(
+/*===========*/
+ /* out: folded value */
+ dulint d); /* in: dulint */
+/*****************************************************************
+Folds a character string ending in the null character. */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+ /* out: folded value */
+ char* str); /* in: null-terminated string */
+/*****************************************************************
+Folds a binary string. */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ /* out: folded value */
+ byte* str, /* in: string of bytes */
+ ulint len); /* in: length */
+/***************************************************************
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2. */
+
+ulint
+ut_find_prime(
+/*==========*/
+ /* out: prime */
+ ulint n); /* in: positive number > 100 */
+
+
+#ifndef UNIV_NONINL
+#include "ut0rnd.ic"
+#endif
+
+#endif
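
Taken together, ut_find_prime() and the hash/fold functions give the usual recipe for sizing and addressing a hash table. A minimal sketch with hypothetical values, assuming only the declarations above:

    ulint   n_expected = 10000;                     /* expected number of entries */
    ulint   n_cells    = ut_find_prime(n_expected); /* prime, not near a power of 2 */

    ulint   int_slot = ut_hash_ulint((ulint)4711, n_cells);        /* integer key */
    ulint   str_slot = ut_fold_string("SYS_TABLES") % n_cells;     /* string key */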
diff --git a/innobase/include/ut0rnd.ic b/innobase/include/ut0rnd.ic
new file mode 100644
index 00000000000..e166a26fe86
--- /dev/null
+++ b/innobase/include/ut0rnd.ic
@@ -0,0 +1,222 @@
+/******************************************************************
+Random numbers and hashing
+
+(c) 1994, 1995 Innobase Oy
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#define UT_HASH_RANDOM_MASK 1463735687
+#define UT_HASH_RANDOM_MASK2 1653893711
+#define UT_RND1 151117737
+#define UT_RND2 119785373
+#define UT_RND3 85689495
+#define UT_RND4 76595339
+#define UT_SUM_RND2 98781234
+#define UT_SUM_RND3 126792457
+#define UT_SUM_RND4 63498502
+#define UT_XOR_RND1 187678878
+#define UT_XOR_RND2 143537923
+
+extern ulint ut_rnd_ulint_counter;
+
+/************************************************************
+This is used to set the random number seed. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+ ulint seed) /* in: seed */
+{
+ ut_rnd_ulint_counter = seed;
+}
+
+/************************************************************
+The following function generates a series of 'random' ulint integers. */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+ /* out: the next 'random' number */
+ ulint rnd) /* in: the previous random number value */
+{
+ ulint n_bits;
+
+ n_bits = 8 * sizeof(ulint);
+
+ rnd = UT_RND2 * rnd + UT_SUM_RND3;
+ rnd = UT_XOR_RND1 ^ rnd;
+ rnd = (rnd << 20) + (rnd >> (n_bits - 20));
+ rnd = UT_RND3 * rnd + UT_SUM_RND4;
+ rnd = UT_XOR_RND2 ^ rnd;
+ rnd = (rnd << 20) + (rnd >> (n_bits - 20));
+ rnd = UT_RND1 * rnd + UT_SUM_RND2;
+
+ return(rnd);
+}
+
+/************************************************************
+The following function generates 'random' ulint integers which
+enumerate the value space of ulint integers in a pseudo-random
+fashion. Note that a given integer repeats only after
+2 to the power 32 calls to the generator (if ulint is 32-bit). */
+UNIV_INLINE
+ulint
+ut_rnd_gen_ulint(void)
+/*==================*/
+ /* out: the 'random' number */
+{
+ ulint rnd;
+ ulint n_bits;
+
+ n_bits = 8 * sizeof(ulint);
+
+ ut_rnd_ulint_counter =
+ UT_RND1 * ut_rnd_ulint_counter + UT_RND2;
+
+ rnd = ut_rnd_gen_next_ulint(ut_rnd_ulint_counter);
+
+ return(rnd);
+}
+
+/************************************************************
+Generates a random integer from a given interval. */
+UNIV_INLINE
+ulint
+ut_rnd_interval(
+/*============*/
+ /* out: the 'random' number */
+ ulint low, /* in: low limit; can generate also this value */
+ ulint high) /* in: high limit; can generate also this value */
+{
+ ulint rnd;
+
+ ut_ad(high >= low);
+
+ if (low == high) {
+
+ return(low);
+ }
+
+ rnd = ut_rnd_gen_ulint();
+
+ return(low + (rnd % (high - low + 1)));
+}
+
+/*************************************************************
+Generates a random ibool value. */
+UNIV_INLINE
+ibool
+ut_rnd_gen_ibool(void)
+/*=================*/
+ /* out: the random value */
+{
+ ulint x;
+
+ x = ut_rnd_gen_ulint();
+
+ if (((x >> 20) + (x >> 15)) & 1) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime
+or some random number for the hash table to work reliably. */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*=========*/
+ /* out: hash value */
+ ulint key, /* in: value to be hashed */
+ ulint table_size) /* in: hash table size */
+{
+ key = key ^ UT_HASH_RANDOM_MASK2;
+
+ return(key % table_size);
+}
+
+/*****************************************************************
+Folds a pair of ulints. */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ /* out: folded value */
+ ulint n1, /* in: ulint */
+ ulint n2) /* in: ulint */
+{
+ return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
+ ^ UT_HASH_RANDOM_MASK) + n2);
+}
+
+/*****************************************************************
+Folds a dulint. */
+UNIV_INLINE
+ulint
+ut_fold_dulint(
+/*===========*/
+ /* out: folded value */
+ dulint d) /* in: dulint */
+{
+ return(ut_fold_ulint_pair(ut_dulint_get_low(d),
+ ut_dulint_get_high(d)));
+}
+
+/*****************************************************************
+Folds a character string ending in the null character. */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+ /* out: folded value */
+ char* str) /* in: null-terminated string */
+{
+ #ifdef UNIV_DEBUG
+ ulint i = 0;
+ #endif
+ ulint fold = 0;
+
+ ut_ad(str);
+
+ while (*str != '\0') {
+
+ #ifdef UNIV_DEBUG
+ i++;
+ ut_a(i < 100);
+ #endif
+
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+ str++;
+ }
+
+ return(fold);
+}
+
+/*****************************************************************
+Folds a binary string. */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ /* out: folded value */
+ byte* str, /* in: string of bytes */
+ ulint len) /* in: length */
+{
+ ulint i;
+ ulint fold = 0;
+
+ ut_ad(str);
+
+ for (i = 0; i < len; i++) {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+
+ str++;
+ }
+
+ return(fold);
+}
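
Because ut_rnd_ulint_counter is the generator's only state, a given seed always reproduces the same stream, and ut_rnd_interval() never strays outside its limits. A minimal sketch of both properties, assuming ut_a() from ut0dbg.h:

    ulint   a;
    ulint   b;

    ut_rnd_set_seed(98765);
    a = ut_rnd_gen_ulint();

    ut_rnd_set_seed(98765);
    b = ut_rnd_gen_ulint();

    ut_a(a == b);                           /* same seed, same first value */
    ut_a(ut_rnd_interval(10, 20) >= 10);    /* stays inside the interval */
    ut_a(ut_rnd_interval(10, 20) <= 20);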
diff --git a/innobase/include/ut0sort.h b/innobase/include/ut0sort.h
new file mode 100644
index 00000000000..d0a3d34e79e
--- /dev/null
+++ b/innobase/include/ut0sort.h
@@ -0,0 +1,91 @@
+/**********************************************************************
+Sort utility
+
+(c) 1995 Innobase Oy
+
+Created 11/9/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0sort_h
+#define ut0sort_h
+
+#include "univ.i"
+
+/* This module gives a macro definition of the body of
+a standard sort function for an array of elements of any
+type. The comparison function is given as a parameter to
+the macro. The sort algorithm is mergesort, which has an
+O(n log n) worst case.
+*/
+
+/***********************************************************************
+This macro expands to the body of a standard sort function.
+The sort function uses mergesort and must be defined separately
+for each type of array.
+Also the comparison function has to be defined individually
+for each array cell type. SORT_FUN is the sort function name.
+The function takes the array to be sorted (ARR),
+the array of auxiliary space (AUX_ARR) of same size,
+and the low (LOW), inclusive, and high (HIGH), noninclusive,
+limits for the sort interval as arguments.
+CMP_FUN is the comparison function name. It takes as arguments
+two elements from the array and returns 1 if the first is bigger,
+0 if they are equal, and -1 if the second is bigger. For an example
+of use, see the test program in tsut.c. */
+
+#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\
+{\
+ ulint ut_sort_mid77;\
+ ulint ut_sort_i77;\
+ ulint ut_sort_low77;\
+ ulint ut_sort_high77;\
+\
+ ut_ad((LOW) < (HIGH));\
+ ut_ad(ARR);\
+ ut_ad(AUX_ARR);\
+\
+ if ((LOW) == (HIGH) - 1) {\
+ return;\
+ } else if ((LOW) == (HIGH) - 2) {\
+ if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\
+ (AUX_ARR)[LOW] = (ARR)[LOW];\
+ (ARR)[LOW] = (ARR)[(HIGH) - 1];\
+ (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\
+ }\
+ return;\
+ }\
+\
+ ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\
+\
+ SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\
+ SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\
+\
+ ut_sort_low77 = (LOW);\
+ ut_sort_high77 = ut_sort_mid77;\
+\
+ for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\
+\
+ if (ut_sort_low77 >= ut_sort_mid77) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else if (ut_sort_high77 >= (HIGH)) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ } else if (CMP_FUN((ARR)[ut_sort_low77],\
+ (ARR)[ut_sort_high77]) > 0) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ }\
+ }\
+\
+ for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\
+ (ARR)[ut_sort_i77] = (AUX_ARR)[ut_sort_i77];\
+ }\
+}\
+
+
+#endif
+
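To make the calling convention concrete, a sort function is defined by wrapping the macro in a function whose own name is passed as SORT_FUN, so that the recursive calls resolve back to it. The sketch below is illustrative only; my_ulint_sort is a hypothetical name, and it reuses ut_ulint_cmp() declared in ut0ut.h as the comparison function:

    #include "ut0ut.h"      /* ut_ulint_cmp() */
    #include "ut0sort.h"

    void
    my_ulint_sort(ulint* arr, ulint* aux_arr, ulint low, ulint high)
    /*===========================================================*/
    {
            UT_SORT_FUNCTION_BODY(my_ulint_sort, arr, aux_arr, low, high,
                                    ut_ulint_cmp);
    }
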
diff --git a/innobase/include/ut0ut.h b/innobase/include/ut0ut.h
new file mode 100644
index 00000000000..05d4f455c58
--- /dev/null
+++ b/innobase/include/ut0ut.h
@@ -0,0 +1,174 @@
+/**********************************************************************
+Various utilities
+
+(c) 1994, 1995 Innobase Oy
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0ut_h
+#define ut0ut_h
+
+#include <time.h>
+#include <ctype.h>
+
+#include "univ.i"
+
+
+typedef time_t ib_time_t;
+
+/**********************************************************
+Calculates the minimum of two ulints. */
+UNIV_INLINE
+ulint
+ut_min(
+/*===*/
+ /* out: minimum */
+ ulint n1, /* in: first number */
+ ulint n2); /* in: second number */
+/**********************************************************
+Calculates the maximum of two ulints. */
+UNIV_INLINE
+ulint
+ut_max(
+/*===*/
+ /* out: maximum */
+ ulint n1, /* in: first number */
+ ulint n2); /* in: second number */
+/********************************************************************
+Calculates minimum of two ulint-pairs. */
+UNIV_INLINE
+void
+ut_pair_min(
+/*========*/
+ ulint* a, /* out: more significant part of minimum */
+ ulint* b, /* out: less significant part of minimum */
+ ulint a1, /* in: more significant part of first pair */
+ ulint b1, /* in: less significant part of first pair */
+ ulint a2, /* in: more significant part of second pair */
+ ulint b2); /* in: less significant part of second pair */
+/**********************************************************
+Compares two ulints. */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ /* out: 1 if a > b, 0 if a == b, -1 if a < b */
+ ulint a, /* in: ulint */
+ ulint b); /* in: ulint */
+/***********************************************************
+Compares two pairs of ulints. */
+UNIV_INLINE
+int
+ut_pair_cmp(
+/*========*/
+ /* out: -1 if a < b, 0 if a == b,
+ 1 if a > b */
+ ulint a1, /* in: more significant part of first pair */
+ ulint a2, /* in: less significant part of first pair */
+ ulint b1, /* in: more significant part of second pair */
+ ulint b2); /* in: less significant part of second pair */
+/*****************************************************************
+Calculates fast the remainder when divided by a power of two. */
+UNIV_INLINE
+ulint
+ut_2pow_remainder(
+/*==============*/ /* out: remainder */
+ ulint n, /* in: number to be divided */
+ ulint m); /* in: divisor; power of 2 */
+/*****************************************************************
+Calculates fast a value rounded to a multiple of a power of 2. */
+UNIV_INLINE
+ulint
+ut_2pow_round(
+/*==========*/ /* out: value of n rounded down to nearest
+ multiple of m */
+ ulint n, /* in: number to be rounded */
+ ulint m); /* in: divisor; power of 2 */
+/*****************************************************************
+Calculates fast the 2-logarithm of a number, rounded upward to an
+integer. */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ /* out: logarithm in the base 2, rounded upward */
+	ulint n);	/* in: number != 0 */
+/*****************************************************************
+Calculates 2 to power n. */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ /* out: 2 to power n */
+ ulint n); /* in: number */
+/*****************************************************************
+Calculates fast the number rounded up to the nearest power of 2. */
+UNIV_INLINE
+ulint
+ut_2_power_up(
+/*==========*/
+ /* out: first power of 2 which is >= n */
+ ulint n); /* in: number != 0 */
+/****************************************************************
+Sort function for ulint arrays. */
+
+void
+ut_ulint_sort(ulint* arr, ulint* aux_arr, ulint low, ulint high);
+/*============================================================*/
+/************************************************************
+The following function returns a clock time in milliseconds. */
+
+ulint
+ut_clock(void);
+/*==========*/
+/**************************************************************
+Returns system time. We do not specify the format of the time returned:
+the only way to manipulate it is to use the function ut_difftime. */
+
+ib_time_t
+ut_time(void);
+/*=========*/
+/**************************************************************
+Returns the difference of two times in seconds. */
+
+double
+ut_difftime(
+/*========*/
+ /* out: time2 - time1 expressed in seconds */
+ ib_time_t time2, /* in: time */
+ ib_time_t time1); /* in: time */
+/*****************************************************************
+Runs an idle loop on CPU. The argument gives the desired delay
+in microseconds on 100 MHz Pentium + Visual C++. */
+
+ulint
+ut_delay(
+/*=====*/
+ /* out: dummy value */
+ ulint delay); /* in: delay in microseconds on 100 MHz Pentium */
+/*****************************************************************
+Prints the contents of a memory buffer in hex and ascii. */
+
+void
+ut_print_buf(
+/*=========*/
+ byte* buf, /* in: memory buffer */
+ ulint len); /* in: length of the buffer */
+/*****************************************************************
+Prints the contents of a memory buffer in hex and ascii to an output string. */
+
+ulint
+ut_sprintf_buf(
+/*===========*/
+ /* out: printed length in bytes */
+ char* str, /* in: buffer to print to */
+ byte* buf, /* in: memory buffer */
+ ulint len); /* in: length of the buffer */
+
+
+#ifndef UNIV_NONINL
+#include "ut0ut.ic"
+#endif
+
+#endif
+
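The power-of-two helpers are address arithmetic in disguise: rounding down to a boundary and taking the offset within it partition a value exactly. A minimal sketch, assuming UNIV_PAGE_SIZE from univ.i (a power of 2) and ut_a() from ut0dbg.h:

    ulint   offset   = 123456;
    ulint   page_top = ut_2pow_round(offset, UNIV_PAGE_SIZE);     /* start of the page */
    ulint   in_page  = ut_2pow_remainder(offset, UNIV_PAGE_SIZE); /* offset within it */

    ut_a(page_top + in_page == offset);
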
diff --git a/innobase/include/ut0ut.ic b/innobase/include/ut0ut.ic
new file mode 100644
index 00000000000..90f25d2b382
--- /dev/null
+++ b/innobase/include/ut0ut.ic
@@ -0,0 +1,196 @@
+/******************************************************************
+Various utilities
+
+(c) 1994, 1995 Innobase Oy
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/**********************************************************
+Calculates the minimum of two ulints. */
+UNIV_INLINE
+ulint
+ut_min(
+/*===*/
+ /* out: minimum */
+ ulint n1, /* in: first number */
+ ulint n2) /* in: second number */
+{
+ return((n1 <= n2) ? n1 : n2);
+}
+
+/**********************************************************
+Calculates the maximum of two ulints. */
+UNIV_INLINE
+ulint
+ut_max(
+/*===*/
+ /* out: maximum */
+ ulint n1, /* in: first number */
+ ulint n2) /* in: second number */
+{
+ return((n1 <= n2) ? n2 : n1);
+}
+
+/********************************************************************
+Calculates minimum of two ulint-pairs. */
+UNIV_INLINE
+void
+ut_pair_min(
+/*========*/
+ ulint* a, /* out: more significant part of minimum */
+ ulint* b, /* out: less significant part of minimum */
+ ulint a1, /* in: more significant part of first pair */
+ ulint b1, /* in: less significant part of first pair */
+ ulint a2, /* in: more significant part of second pair */
+ ulint b2) /* in: less significant part of second pair */
+{
+ if (a1 == a2) {
+ *a = a1;
+ *b = ut_min(b1, b2);
+ } else if (a1 < a2) {
+ *a = a1;
+ *b = b1;
+ } else {
+ *a = a2;
+ *b = b2;
+ }
+}
+
+/**********************************************************
+Compares two ulints. */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ /* out: 1 if a > b, 0 if a == b, -1 if a < b */
+ ulint a, /* in: ulint */
+ ulint b) /* in: ulint */
+{
+ if (a < b) {
+ return(-1);
+ } else if (a == b) {
+ return(0);
+ } else {
+ return(1);
+ }
+}
+
+/***********************************************************
+Compares two pairs of ulints. */
+UNIV_INLINE
+int
+ut_pair_cmp(
+/*========*/
+ /* out: -1 if a < b, 0 if a == b, 1 if a > b */
+ ulint a1, /* in: more significant part of first pair */
+ ulint a2, /* in: less significant part of first pair */
+ ulint b1, /* in: more significant part of second pair */
+ ulint b2) /* in: less significant part of second pair */
+{
+ if (a1 > b1) {
+ return(1);
+ } else if (a1 < b1) {
+ return(-1);
+ } else if (a2 > b2) {
+ return(1);
+ } else if (a2 < b2) {
+ return(-1);
+ } else {
+ return(0);
+ }
+}
+
+/*****************************************************************
+Calculates fast the remainder when divided by a power of two. */
+UNIV_INLINE
+ulint
+ut_2pow_remainder(
+/*==============*/ /* out: remainder */
+ ulint n, /* in: number to be divided */
+ ulint m) /* in: divisor; power of 2 */
+{
+ ut_ad(0x80000000 % m == 0);
+
+ return(n & (m - 1));
+}
+
+/*****************************************************************
+Calculates fast a value rounded to a multiple of a power of 2. */
+UNIV_INLINE
+ulint
+ut_2pow_round(
+/*==========*/ /* out: value of n rounded down to nearest
+ multiple of m */
+ ulint n, /* in: number to be rounded */
+ ulint m) /* in: divisor; power of 2 */
+{
+ ut_ad(0x80000000 % m == 0);
+
+ return(n & ~(m - 1));
+}
+
+/*****************************************************************
+Calculates fast the 2-logarithm of a number, rounded upward to an
+integer. */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ /* out: logarithm in the base 2, rounded upward */
+ ulint n) /* in: number != 0 */
+{
+ ulint res;
+
+ res = 0;
+
+ ut_ad(n > 0);
+
+ n = n - 1;
+
+ for (;;) {
+ n = n / 2;
+
+ if (n == 0) {
+ break;
+ }
+
+ res++;
+ }
+
+ return(res + 1);
+}
+
+/*****************************************************************
+Calculates 2 to power n. */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ /* out: 2 to power n */
+ ulint n) /* in: number */
+{
+ return(1 << n);
+}
+
+/*****************************************************************
+Calculates fast the number rounded up to the nearest power of 2. */
+UNIV_INLINE
+ulint
+ut_2_power_up(
+/*==========*/
+ /* out: first power of 2 which is >= n */
+ ulint n) /* in: number != 0 */
+{
+ ulint res;
+
+ res = 1;
+
+ ut_ad(n > 0);
+
+ while (res < n) {
+ res = res * 2;
+ }
+
+ return(res);
+}
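
A few worked values pin down the rounding behaviour of the helpers above; note that ut_2_log(1) evaluates to 1 with this implementation, not 0. Written as assertions, assuming ut_a() from ut0dbg.h:

    ut_a(ut_2_log(8) == 3);
    ut_a(ut_2_log(9) == 4);                          /* rounded upward */
    ut_a(ut_2_power_up(9) == 16);
    ut_a(ut_2_exp(ut_2_log(9)) == ut_2_power_up(9)); /* both are 16 */
    ut_a(ut_2_log(1) == 1);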