author     unknown <monty@donna.mysql.com>  2001-02-17 14:19:19 +0200
committer  unknown <monty@donna.mysql.com>  2001-02-17 14:19:19 +0200
commit     2662b59306ef0cd495fa6e2edf7129e58a11393a (patch)
tree       bfe39951a73e906579ab819bf5198ad8f3a64a36 /innobase/row
parent     66de55a56bdcf2f7a9c0c4f8e19b3e761475e202 (diff)
download   mariadb-git-2662b59306ef0cd495fa6e2edf7129e58a11393a.tar.gz
Added Innobase to source distribution
Docs/manual.texi:            Added Innobase documentation
configure.in:                Incremented version
include/my_base.h:           Added option for Innobase
myisam/mi_check.c:           cleanup
mysql-test/t/bdb.test:       cleanup
mysql-test/t/innobase.test:  Extended with new tests from bdb.test
mysql-test/t/merge.test:     Added test of SHOW create
mysys/my_init.c:             Fix for UNIXWARE 7
scripts/mysql_install_db.sh: Always write how to start mysqld
scripts/safe_mysqld.sh:      Fixed typo
sql/ha_innobase.cc:          Update to new version
sql/ha_innobase.h:           Update to new version
sql/handler.h:               Added 'update_table_comment()' and 'append_create_info()'
sql/sql_delete.cc:           Fixes for Innobase
sql/sql_select.cc:           Fixes for Innobase
sql/sql_show.cc:             Append create information (for MERGE tables)
sql/sql_update.cc:           Fixes for Innobase
Diffstat (limited to 'innobase/row')
-rw-r--r--  innobase/row/Makefile.am      25
-rw-r--r--  innobase/row/makefilewin      34
-rw-r--r--  innobase/row/row0ins.c      1018
-rw-r--r--  innobase/row/row0mysql.c    1116
-rw-r--r--  innobase/row/row0purge.c     553
-rw-r--r--  innobase/row/row0row.c       652
-rw-r--r--  innobase/row/row0sel.c      2732
-rw-r--r--  innobase/row/row0uins.c      308
-rw-r--r--  innobase/row/row0umod.c      608
-rw-r--r--  innobase/row/row0undo.c      313
-rw-r--r--  innobase/row/row0upd.c      1394
-rw-r--r--  innobase/row/row0vers.c      409
-rw-r--r--  innobase/row/ts/makefile      16
-rw-r--r--  innobase/row/ts/tstcur.c    1087
14 files changed, 10265 insertions, 0 deletions
diff --git a/innobase/row/Makefile.am b/innobase/row/Makefile.am
new file mode 100644
index 00000000000..e4fcbe8f715
--- /dev/null
+++ b/innobase/row/Makefile.am
@@ -0,0 +1,25 @@
+# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+# & Innobase Oy
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+include ../include/Makefile.i
+
+libs_LIBRARIES = librow.a
+
+librow_a_SOURCES = row0ins.c row0mysql.c row0purge.c row0row.c row0sel.c\
+ row0uins.c row0umod.c row0undo.c row0upd.c row0vers.c
+
+EXTRA_PROGRAMS =
diff --git a/innobase/row/makefilewin b/innobase/row/makefilewin
new file mode 100644
index 00000000000..c17240c6119
--- /dev/null
+++ b/innobase/row/makefilewin
@@ -0,0 +1,34 @@
+include ..\include\makefile.i
+
+row.lib: row0mysql.obj row0upd.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj
+ lib -out:..\libs\row.lib row0mysql.obj row0sel.obj row0umod.obj row0uins.obj row0ins.obj row0upd.obj row0undo.obj row0purge.obj row0vers.obj row0row.obj
+
+row0mysql.obj: row0mysql.c
+ $(CCOM) $(CFL) -c row0mysql.c
+
+row0ins.obj: row0ins.c
+ $(CCOM) $(CFL) -c row0ins.c
+
+row0sel.obj: row0sel.c
+ $(CCOM) $(CFL) -c row0sel.c
+
+row0upd.obj: row0upd.c
+ $(CCOM) $(CFL) -c row0upd.c
+
+row0undo.obj: row0undo.c
+ $(CCOM) $(CFL) -c row0undo.c
+
+row0purge.obj: row0purge.c
+ $(CCOM) $(CFL) -c row0purge.c
+
+row0row.obj: row0row.c
+ $(CCOM) $(CFL) -c row0row.c
+
+row0vers.obj: row0vers.c
+ $(CCOM) $(CFL) -c row0vers.c
+
+row0umod.obj: row0umod.c
+ $(CCOM) $(CFL) -c row0umod.c
+
+row0uins.obj: row0uins.c
+ $(CCOM) $(CFL) -c row0uins.c
diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c
new file mode 100644
index 00000000000..4502cb8235f
--- /dev/null
+++ b/innobase/row/row0ins.c
@@ -0,0 +1,1018 @@
+/******************************************************
+Insert into a table
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+
+#ifdef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "usr0sess.h"
+
+#define ROW_INS_PREV 1
+#define ROW_INS_NEXT 2
+
+/*************************************************************************
+Creates an insert node struct. */
+
+ins_node_t*
+ins_node_create(
+/*============*/
+ /* out, own: insert node struct */
+ ulint ins_type, /* in: INS_VALUES, ... */
+ dict_table_t* table, /* in: table where to insert */
+ mem_heap_t* heap) /* in: mem heap where created */
+{
+ ins_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(ins_node_t));
+
+ node->common.type = QUE_NODE_INSERT;
+
+ node->ins_type = ins_type;
+
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->table = table;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->select = NULL;
+
+ node->trx_id = ut_dulint_zero;
+
+ node->entry_sys_heap = mem_heap_create(128);
+
+ node->magic_n = INS_NODE_MAGIC_N;
+
+ return(node);
+}
+
+/***************************************************************
+Creates an entry template for each index of a table. */
+static
+void
+ins_node_create_entry_list(
+/*=======================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ dict_index_t* index;
+ dtuple_t* entry;
+
+ ut_ad(node->entry_sys_heap);
+
+ UT_LIST_INIT(node->entry_list);
+
+ index = dict_table_get_first_index(node->table);
+
+ while (index != NULL) {
+ entry = row_build_index_entry(node->row, index,
+ node->entry_sys_heap);
+ UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
+
+ index = dict_table_get_next_index(index);
+ }
+}
+
+/*********************************************************************
+Adds system field buffers to a row. */
+static
+void
+row_ins_alloc_sys_fields(
+/*=====================*/
+ ins_node_t* node) /* in: insert node */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ mem_heap_t* heap;
+ dict_col_t* col;
+ dfield_t* dfield;
+ ulint len;
+ byte* ptr;
+
+ row = node->row;
+ table = node->table;
+ heap = node->entry_sys_heap;
+
+ ut_ad(row && table && heap);
+ ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
+
+ /* 1. Allocate buffer for row id */
+
+ col = dict_table_get_sys_col(table, DATA_ROW_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ ptr = mem_heap_alloc(heap, DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
+
+ node->row_id_buf = ptr;
+
+ if (table->type == DICT_TABLE_CLUSTER_MEMBER) {
+
+ /* 2. Fill in the dfield for mix id */
+
+ col = dict_table_get_sys_col(table, DATA_MIX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ len = mach_dulint_get_compressed_size(table->mix_id);
+ ptr = mem_heap_alloc(heap, DATA_MIX_ID_LEN);
+
+ mach_dulint_write_compressed(ptr, table->mix_id);
+ dfield_set_data(dfield, ptr, len);
+ }
+
+ /* 3. Allocate buffer for trx id */
+
+ col = dict_table_get_sys_col(table, DATA_TRX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ ptr = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
+
+ node->trx_id_buf = ptr;
+
+ /* 4. Allocate buffer for roll ptr */
+
+ col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ ptr = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
+}
+
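An aside on the buffers reserved above: every Innobase row carries hidden system columns, a row id (used only when the table has no unique key to serve as the clustered index), a transaction id, and a rollback pointer into the undo log. A minimal sketch restating that layout; the 6/6/7-byte lengths are assumed to mirror the DATA_ROW_ID_LEN, DATA_TRX_ID_LEN and DATA_ROLL_PTR_LEN constants, and all names below are hypothetical:

    #include <stdio.h>

    /* Hypothetical restatement of the hidden system columns that
       row_ins_alloc_sys_fields() reserves buffers for; lengths assumed
       to mirror the DATA_*_LEN constants of this code base. */
    enum {
            SYS_ROW_ID_LEN   = 6,   /* assigned when the clustered index
                                       is not built on a unique key */
            SYS_TRX_ID_LEN   = 6,   /* id of the last modifying trx */
            SYS_ROLL_PTR_LEN = 7    /* pointer into the undo log */
    };

    int main(void)
    {
            printf("system column overhead: %d bytes per clustered row\n",
                   SYS_ROW_ID_LEN + SYS_TRX_ID_LEN + SYS_ROLL_PTR_LEN);
            return 0;
    }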
+/*************************************************************************
+Sets a new row to insert for an INS_DIRECT node. This function is only
+used if we have constructed the row separately, which is a rare case;
+it is quite slow. */
+
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /* in: insert node */
+ dtuple_t* row) /* in: new row (or first row) for the node */
+{
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->row = row;
+
+ mem_heap_empty(node->entry_sys_heap);
+
+ /* Create templates for index entries */
+
+ ins_node_create_entry_list(node);
+
+ /* Allocate from entry_sys_heap buffers for sys fields */
+
+ row_ins_alloc_sys_fields(node);
+
+ /* As we allocated a new trx id buf, the trx id should be written
+ there again: */
+
+ node->trx_id = ut_dulint_zero;
+}
+
+/***********************************************************************
+Does an insert operation by updating a delete marked existing record
+in the index. This situation can occur if the delete marked record is
+kept in the index for consistent reads. */
+static
+ulint
+row_ins_sec_index_entry_by_modify(
+/*==============================*/
+ /* out: DB_SUCCESS or error code */
+ btr_cur_t* cursor, /* in: B-tree cursor */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint err;
+
+ ut_ad(((cursor->index)->type & DICT_CLUSTERED) == 0);
+ ut_ad(rec_get_deleted_flag(btr_cur_get_rec(cursor)));
+
+ /* We just remove the delete mark from the secondary index record */
+ err = btr_cur_del_mark_set_sec_rec(0, cursor, FALSE, thr, mtr);
+
+ return(err);
+}
+
+/***********************************************************************
+Does an insert operation by delete unmarking and updating a delete marked
+existing record in the index. This situation can occur if the delete marked
+record is kept in the index for consistent reads. */
+static
+ulint
+row_ins_clust_index_entry_by_modify(
+/*================================*/
+ /* out: DB_SUCCESS, DB_FAIL, or error code */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /* in: B-tree cursor */
+ dtuple_t* entry, /* in: index entry to insert */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+ mem_heap_t* heap;
+ rec_t* rec;
+ upd_t* update;
+ ulint err;
+
+ ut_ad((cursor->index)->type & DICT_CLUSTERED);
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(rec_get_deleted_flag(rec));
+
+ heap = mem_heap_create(1024);
+
+ /* Build an update vector containing all the fields to be modified;
+ NOTE that this vector may contain also system columns! */
+
+ update = row_upd_build_difference(cursor->index, entry, rec, heap);
+
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
+ mtr);
+ if ((err == DB_OVERFLOW) || (err == DB_UNDERFLOW)) {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_update(0, cursor, update, 0, thr,
+ mtr);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/*******************************************************************
+Checks if a unique key violation with respect to rec would occur at the
+index entry insert. */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+ /* out: TRUE if error */
+ rec_t* rec, /* in: user record */
+ dtuple_t* entry, /* in: entry to insert */
+ dict_index_t* index, /* in: index */
+ trx_t* trx) /* in: inserting transaction */
+{
+ ulint matched_fields;
+ ulint matched_bytes;
+ ulint n_unique;
+ trx_t* impl_trx;
+
+ n_unique = dict_index_get_n_unique(index);
+
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes);
+
+ if (matched_fields < n_unique) {
+
+ return(FALSE);
+ }
+
+ if (!rec_get_deleted_flag(rec)) {
+
+ return(TRUE);
+ }
+
+ /* If we get here, the record has its delete mark set. It is still
+ a unique key violation if the transaction which set the delete mark
+ is currently active and is not trx itself. We check if some
+ transaction has an implicit x-lock on the record. */
+
+ mutex_enter(&kernel_mutex);
+
+ if (index->type & DICT_CLUSTERED) {
+ impl_trx = lock_clust_rec_some_has_impl(rec, index);
+ } else {
+ impl_trx = lock_sec_rec_some_has_impl_off_kernel(rec, index);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ if (impl_trx && impl_trx != trx) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
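The check above reduces to a short decision table: fewer matching fields than the unique key prefix means no violation; a full match against a live record is always a violation; a full match against a delete marked record is a violation only while another active transaction still holds an implicit x-lock on it. A standalone restatement in plain C, with hypothetical flattened inputs (not the Innobase API):

    #include <stdbool.h>

    /* Hypothetical flattened inputs, for illustration only. */
    typedef struct {
            bool delete_marked;          /* rec_get_deleted_flag(rec) */
            bool other_active_trx_lock;  /* impl_trx && impl_trx != trx */
    } rec_state_t;

    /* The decision table implemented by row_ins_dupl_error_with_rec(). */
    static bool
    is_dupl_error(unsigned matched_fields, unsigned n_unique,
                  rec_state_t rec)
    {
            if (matched_fields < n_unique) {
                    return false;  /* key values differ: no violation */
            }
            if (!rec.delete_marked) {
                    return true;   /* live record with the same key */
            }
            /* Delete marked: still a violation while the transaction
               that set the mark is active and is not ourselves. */
            return rec.other_active_trx_lock;
    }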
+/*******************************************************************
+Scans a unique non-clustered index at a given index entry to determine
+whether a uniqueness violation has occurred for the key value of the entry. */
+static
+ulint
+row_ins_scan_sec_index_for_duplicate(
+/*=================================*/
+ /* out: DB_SUCCESS or DB_DUPLICATE_KEY */
+ dict_index_t* index, /* in: non-clustered unique index */
+ dtuple_t* entry, /* in: index entry */
+ trx_t* trx) /* in: inserting transaction */
+{
+ ulint dupl_count = 0;
+ int cmp;
+ ulint n_fields_cmp;
+ rec_t* rec;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+	/* Store the old value of n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index));
+
+ btr_pcur_open_on_user_rec(index, entry, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ /* Scan index records and check that there are no duplicates */
+
+ for (;;) {
+ if (btr_pcur_is_after_last_in_tree(&pcur, &mtr)) {
+
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ cmp = cmp_dtuple_rec(entry, rec);
+
+ if (cmp == 0) {
+ if (row_ins_dupl_error_with_rec(rec, entry, index,
+ trx)) {
+ dupl_count++;
+
+ if (dupl_count > 1) {
+ /* printf(
+ "Duplicate key in index %s\n",
+ index->name);
+ dtuple_print(entry); */
+ }
+ }
+ }
+
+ if (cmp < 0) {
+ break;
+ }
+
+ ut_a(cmp == 0);
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+ ut_a(dupl_count >= 1);
+
+ if (dupl_count > 1) {
+
+ return(DB_DUPLICATE_KEY);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************
+Tries to check if a unique key violation error would occur at an index entry
+insert. */
+static
+ulint
+row_ins_duplicate_error(
+/*====================*/
+ /* out: DB_SUCCESS if no error
+ DB_DUPLICATE_KEY if error,
+ DB_STRONG_FAIL if this is a non-clustered
+ index record and we cannot determine yet
+ if there will be an error: in this last
+ case we must call
+ row_ins_scan_sec_index_for_duplicate
+ AFTER the insertion of the record! */
+ btr_cur_t* cursor, /* in: B-tree cursor */
+ dtuple_t* entry, /* in: entry to insert */
+ trx_t* trx, /* in: inserting transaction */
+ mtr_t* mtr, /* in: mtr */
+ rec_t** dupl_rec)/* out: record with which duplicate error */
+{
+ rec_t* rec;
+ page_t* page;
+ ulint n_unique;
+
+ ut_ad(cursor->index->type & DICT_UNIQUE);
+
+ /* NOTE: For unique non-clustered indexes there may be any number
+ of delete marked records with the same value for the non-clustered
+ index key (remember multiversioning), and which differ only in
+	the row reference part of the index record, containing the
+	clustered index key fields. For such a secondary index record,
+	to avoid a race condition, we must FIRST do the insertion and after
+	that check that the uniqueness condition is not breached! */
+
+	/* NOTE: A problem is that in the B-tree, node pointers on an
+	upper level may match the entry more closely than the actual
+	existing user records on the leaf level. So, even if low_match
+	would suggest that a duplicate key violation may occur, this may
+	not be the case. */
+
+ n_unique = dict_index_get_n_unique(cursor->index);
+
+ if (cursor->low_match >= n_unique) {
+
+ rec = btr_cur_get_rec(cursor);
+ page = buf_frame_align(rec);
+
+ if (rec != page_get_infimum_rec(page)) {
+
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ cursor->index, trx)) {
+ *dupl_rec = rec;
+
+ return(DB_DUPLICATE_KEY);
+ }
+ }
+ }
+
+ if (cursor->up_match >= n_unique) {
+
+ rec = page_rec_get_next(btr_cur_get_rec(cursor));
+ page = buf_frame_align(rec);
+
+ if (rec != page_get_supremum_rec(page)) {
+
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ cursor->index, trx)) {
+ *dupl_rec = rec;
+
+ return(DB_DUPLICATE_KEY);
+ }
+ }
+
+ ut_a(!(cursor->index->type & DICT_CLUSTERED));
+ /* This should never happen */
+ }
+
+ if (cursor->index->type & DICT_CLUSTERED) {
+
+ return(DB_SUCCESS);
+ }
+
+ /* It was a non-clustered index: we must scan the index after the
+ insertion to be sure if there will be duplicate key error */
+
+ return(DB_STRONG_FAIL);
+}
+
+/*******************************************************************
+Checks if an index entry has long enough common prefix with an existing
+record so that the intended insert of the entry must be changed to a modify of
+the existing record. In the case of a clustered index, the prefix must be
+n_unique fields long, and in the case of a secondary index, all fields must be
+equal. */
+UNIV_INLINE
+ulint
+row_ins_must_modify(
+/*================*/
+ /* out: 0 if no update, ROW_INS_PREV if
+ previous should be updated; currently we
+ do the search so that only the low_match
+ record can match enough to the search tuple,
+ not the next record */
+ btr_cur_t* cursor) /* in: B-tree cursor */
+{
+ ulint enough_match;
+ rec_t* rec;
+ page_t* page;
+
+	/* NOTE: (compare to the note in row_ins_duplicate_error) Because
+	node pointers on upper levels of the B-tree may match the entry
+	more closely than the actual user records on the leaf level, we
+	have to check if the candidate record is actually a user record.
+	In a clustered index, node pointers contain the index->n_unique
+	first fields, and in the case of a secondary index, all fields of
+	the index. */
+
+ enough_match = dict_index_get_n_unique_in_tree(cursor->index);
+
+ if (cursor->low_match >= enough_match) {
+
+ rec = btr_cur_get_rec(cursor);
+ page = buf_frame_align(rec);
+
+ if (rec != page_get_infimum_rec(page)) {
+
+ return(ROW_INS_PREV);
+ }
+ }
+
+ return(0);
+}
+
+/*******************************************************************
+Tries to insert an index entry into an index. If the index is clustered
+and a record with the same unique key is found, the other record is
+necessarily delete marked by a committed transaction, or a unique key
+violation error occurs. The delete marked record is then updated to
+match the entry, and we must write an undo log record on the delete
+marked record. If the index is secondary, and a record with exactly the
+same fields is found, the other record is necessarily delete marked.
+It is then delete unmarked. Otherwise, the entry is just inserted into
+the index. */
+
+ulint
+row_ins_index_entry_low(
+/*====================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL
+ if pessimistic retry needed, or error code */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to insert */
+ que_thr_t* thr) /* in: query thread */
+{
+ btr_cur_t cursor;
+ ulint dupl = DB_SUCCESS;
+ ulint modify;
+ rec_t* dummy_rec;
+ rec_t* rec;
+ rec_t* dupl_rec; /* Note that this may be undefined
+ for a non-clustered index even if
+ there is a duplicate key */
+ ulint err;
+ ulint n_unique;
+ mtr_t mtr;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ cursor.thr = thr;
+
+	/* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return sensible values in both low_match and
+	up_match of the cursor */
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ mode | BTR_INSERT, &cursor, 0, &mtr);
+
+ if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
+ /* The insertion was made to the insert buffer already during
+ the search: we are done */
+
+ err = DB_SUCCESS;
+
+ goto function_exit;
+ }
+
+ n_unique = dict_index_get_n_unique(index);
+
+ if (index->type & DICT_UNIQUE && (cursor.up_match >= n_unique
+ || cursor.low_match >= n_unique)) {
+
+ dupl = row_ins_duplicate_error(&cursor, entry,
+ thr_get_trx(thr), &mtr, &dupl_rec);
+ if (dupl == DB_DUPLICATE_KEY) {
+
+ /* printf("Duplicate key in index %s lm %lu\n",
+ cursor->index->name, cursor->low_match);
+ rec_print(rec);
+ dtuple_print(entry); */
+
+ err = dupl;
+
+ goto function_exit;
+ }
+ }
+
+ modify = row_ins_must_modify(&cursor);
+
+ if (modify != 0) {
+ /* There is already an index entry with a long enough common
+ prefix, we must convert the insert into a modify of an
+ existing record */
+
+ if (modify == ROW_INS_NEXT) {
+ rec = page_rec_get_next(btr_cur_get_rec(&cursor));
+
+ btr_cur_position(index, rec, &cursor);
+ }
+
+ if (index->type & DICT_CLUSTERED) {
+ err = row_ins_clust_index_entry_by_modify(mode,
+ &cursor, entry,
+ thr, &mtr);
+ } else {
+ err = row_ins_sec_index_entry_by_modify(&cursor,
+ thr, &mtr);
+ }
+
+ } else if (mode == BTR_MODIFY_LEAF) {
+ err = btr_cur_optimistic_insert(0, &cursor, entry,
+ &dummy_rec, thr, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_insert(0, &cursor, entry,
+ &dummy_rec, thr, &mtr);
+ }
+function_exit:
+ mtr_commit(&mtr);
+
+ if (err == DB_SUCCESS && dupl == DB_STRONG_FAIL) {
+ /* We were not able to determine before the insertion
+ whether there will be a duplicate key error: do the check
+ now */
+
+ err = row_ins_scan_sec_index_for_duplicate(index, entry,
+ thr_get_trx(thr));
+ }
+
+ ut_ad(err != DB_DUPLICATE_KEY || index->type & DICT_CLUSTERED
+ || DB_DUPLICATE_KEY ==
+ row_ins_scan_sec_index_for_duplicate(index, entry,
+ thr_get_trx(thr)));
+ return(err);
+}
+
+/*******************************************************************
+Inserts an index entry into an index. Tries first an optimistic, then
+a pessimistic descent down the tree. If the entry matches enough to a
+delete marked record, performs the insert by updating or delete
+unmarking the delete marked record. */
+
+ulint
+row_ins_index_entry(
+/*================*/
+ /* out: DB_SUCCESS, DB_LOCK_WAIT,
+ DB_DUPLICATE_KEY, or some other error code */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to insert */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ /* Try first optimistic descent to the B-tree */
+
+ err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, thr);
+
+ if (err != DB_FAIL) {
+
+ return(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+
+ err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, thr);
+
+ return(err);
+}
+
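The two-phase structure above is a pattern used throughout Innobase: attempt the cheap, leaf-page-only operation first, and only when it returns DB_FAIL retry with tree latches that allow page splits and merges. A condensed sketch of the pattern, with hypothetical names standing in for the BTR_MODIFY_* modes and error codes:

    /* Sketch of the optimistic-then-pessimistic retry in
       row_ins_index_entry(); op() and the constants are hypothetical
       stand-ins, not the Innobase API. */
    enum { MODE_LEAF, MODE_TREE };        /* BTR_MODIFY_LEAF / _TREE */
    enum { ERR_OK = 0, ERR_FAIL = 1 };    /* DB_SUCCESS / DB_FAIL */

    typedef int (*index_op_t)(int mode, void *arg);

    static int
    op_with_retry(index_op_t op, void *arg)
    {
            int err = op(MODE_LEAF, arg); /* optimistic: leaf latch only */

            if (err != ERR_FAIL) {
                    return err;           /* success, lock wait, or a
                                             hard error: do not retry */
            }
            /* The leaf page had no room (or similar): retry with tree
               latches, which may split or merge pages. */
            return op(MODE_TREE, arg);
    }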
+/***************************************************************
+Sets the values of the dtuple fields in entry from the values of the
+appropriate columns in row. */
+UNIV_INLINE
+void
+row_ins_index_entry_set_vals(
+/*=========================*/
+ dtuple_t* entry, /* in: index entry to make */
+ dtuple_t* row) /* in: row */
+{
+ dfield_t* field;
+ dfield_t* row_field;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(entry && row);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(entry, i);
+
+ row_field = dtuple_get_nth_field(row, field->col_no);
+
+ field->data = row_field->data;
+ field->len = row_field->len;
+ }
+}
+
+/***************************************************************
+Inserts a single index entry into the table. */
+UNIV_INLINE
+ulint
+row_ins_index_entry_step(
+/*=====================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ ins_node_t* node, /* in: row insert node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad(dtuple_check_typed(node->row));
+
+ row_ins_index_entry_set_vals(node->entry, node->row);
+
+ ut_ad(dtuple_check_typed(node->entry));
+
+ err = row_ins_index_entry(node->index, node->entry, thr);
+
+ return(err);
+}
+
+/***************************************************************
+Allocates a row id for row and inits the node->index field. */
+UNIV_INLINE
+void
+row_ins_alloc_row_id_step(
+/*======================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ dulint row_id;
+
+ ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
+
+ if (dict_table_get_first_index(node->table)->type & DICT_UNIQUE) {
+
+ /* No row id is stored if the clustered index is unique */
+
+ return;
+ }
+
+ /* Fill in row id value to row */
+
+ row_id = dict_sys_get_new_row_id();
+
+ dict_sys_write_row_id(node->row_id_buf, row_id);
+}
+
+/***************************************************************
+Gets a row to insert from the values list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_values(
+/*========================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+	/* The field values are copied into the buffers of the value
+	expressions and it is safe to use them until we evaluate the list
+	again: therefore we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->values_list;
+
+ while (list_node) {
+ eval_exp(list_node);
+
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***************************************************************
+Gets a row to insert from the select list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_select(
+/*========================*/
+ ins_node_t* node) /* in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+	/* The field values are copied into the buffers of the select node
+	and it is safe to use them until we fetch from select again:
+	therefore we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->select->select_list;
+
+ while (list_node) {
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***************************************************************
+Inserts a row into a table. */
+
+ulint
+row_ins(
+/*====*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ ins_node_t* node, /* in: row insert node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad(node && thr);
+
+ if (node->state == INS_NODE_ALLOC_ROW_ID) {
+
+ row_ins_alloc_row_id_step(node);
+
+ node->index = dict_table_get_first_index(node->table);
+ node->entry = UT_LIST_GET_FIRST(node->entry_list);
+
+ if (node->ins_type == INS_SEARCHED) {
+
+ row_ins_get_row_from_select(node);
+
+ } else if (node->ins_type == INS_VALUES) {
+
+ row_ins_get_row_from_values(node);
+ }
+
+ node->state = INS_NODE_INSERT_ENTRIES;
+ }
+
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
+
+ while (node->index != NULL) {
+ err = row_ins_index_entry_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
+ }
+
+ ut_ad(node->entry == NULL);
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Inserts a row into a table. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+row_ins_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ ins_node_t* node;
+ que_node_t* parent;
+ sel_node_t* sel_node;
+ trx_t* trx;
+ ulint err;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
+
+ parent = que_node_get_parent(node);
+ sel_node = node->select;
+
+ if (thr->prev_node == parent) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ }
+
+ /* If this is the first time this node is executed (or when
+ execution resumes after wait for the table IX lock), set an
+ IX lock on the table and reset the possible select node. */
+
+ if (node->state == INS_NODE_SET_IX_LOCK) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(trx);
+
+ if (UT_DULINT_EQ(trx->id, node->trx_id)) {
+ /* No need to do IX-locking or write trx id to buf */
+
+ goto same_trx;
+ }
+
+ trx_write_trx_id(node->trx_id_buf, trx->id);
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+
+ node->trx_id = trx->id;
+ same_trx:
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ if ((node->ins_type == INS_SEARCHED)
+ && (sel_node->state != SEL_NODE_FETCH)) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to insert */
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_ins(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
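Closing note on this file: row_ins_step() and row_ins() together drive a small per-row state machine. The enum below is an editor's condensed view with hypothetical names; the real INS_NODE_* constants are defined in row0ins.h:

    /* Condensed view of the insert node states used above. */
    typedef enum {
            STATE_SET_IX_LOCK,    /* first row of a statement: IX-lock
                                     the table, write the trx id, reset
                                     a possible select node */
            STATE_ALLOC_ROW_ID,   /* per row: assign a row id (unless the
                                     clustered index is unique) and fetch
                                     or evaluate the row to insert */
            STATE_INSERT_ENTRIES  /* insert one entry per index; resumes
                                     here after a lock wait */
    } ins_node_state;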
diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c
new file mode 100644
index 00000000000..13d84ffd358
--- /dev/null
+++ b/innobase/row/row0mysql.c
@@ -0,0 +1,1116 @@
+/******************************************************
+Interface between Innobase row operations and MySQL.
+Also contains create table and other data dictionary operations.
+
+(c) 2000 Innobase Oy
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#include "row0mysql.h"
+
+#ifdef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "dict0crea.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "lock0lock.h"
+
+/***********************************************************************
+Reads the length of a MySQL format variable-length field (like VARCHAR)
+and returns a pointer to the field data. */
+
+byte*
+row_mysql_read_var_ref_noninline(
+/*=============================*/
+ /* out: field + 2 */
+ ulint* len, /* out: variable-length field length */
+ byte* field) /* in: field */
+{
+ return(row_mysql_read_var_ref(len, field));
+}
+
+/***********************************************************************
+Stores a reference to a BLOB in the MySQL format. */
+
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /* in: where to store */
+ ulint col_len, /* in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ this may vary from 1 to 4 bytes */
+ byte* data, /* in: BLOB data */
+ ulint len) /* in: BLOB length */
+{
+ /* In dest there are 1 - 4 bytes reserved for the BLOB length,
+ and after that 8 bytes reserved for the pointer to the data.
+	On 32-bit architectures we only use the first 4 bytes of the
+	pointer slot. */
+
+ mach_write_to_n_little_endian(dest, col_len - 8, len);
+
+ ut_memcpy(dest + col_len - 8, (byte*)&data, sizeof(byte*));
+}
+
+/***********************************************************************
+Reads a reference to a BLOB in the MySQL format. */
+
+byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ /* out: pointer to BLOB data */
+ ulint* len, /* out: BLOB length */
+ byte* ref, /* in: BLOB reference in the MySQL format */
+ ulint col_len) /* in: BLOB reference length (not BLOB
+ length) */
+{
+ byte* data;
+
+ *len = mach_read_from_n_little_endian(ref, col_len - 8);
+
+ ut_memcpy((byte*)&data, ref + col_len - 8, sizeof(byte*));
+
+ return(data);
+}
+
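As the pair above shows, a MySQL BLOB reference occupies col_len bytes: the first col_len - 8 bytes hold the data length in little-endian order, and the trailing 8 bytes hold a native pointer (32-bit builds use only the first 4 of them). A standalone round-trip sketch using plain memcpy; the helper names are hypothetical, not the mach_/ut_ utilities:

    #include <stdint.h>
    #include <string.h>

    /* Store a BLOB reference: little-endian length bytes followed by
       8 bytes of pointer storage. */
    static void
    blob_ref_store(unsigned char *dest, size_t col_len,
                   unsigned char *data, uint32_t len)
    {
            size_t  i;
            size_t  len_bytes = col_len - 8;

            for (i = 0; i < len_bytes; i++) {
                    dest[i] = (unsigned char)(len >> (8 * i));
            }
            memcpy(dest + len_bytes, &data, sizeof(unsigned char *));
    }

    /* Read it back: returns the data pointer, sets *len. */
    static unsigned char *
    blob_ref_read(uint32_t *len, const unsigned char *ref, size_t col_len)
    {
            unsigned char  *data;
            size_t          i;
            size_t          len_bytes = col_len - 8;

            *len = 0;
            for (i = 0; i < len_bytes; i++) {
                    *len |= (uint32_t)ref[i] << (8 * i);
            }
            memcpy(&data, ref + len_bytes, sizeof(unsigned char *));
            return data;
    }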
+/******************************************************************
+Converts a row in the MySQL format to a row in the Innobase format. */
+static
+void
+row_mysql_convert_row_to_innobase(
+/*==============================*/
+ dtuple_t* row, /* in/out: Innobase row where the
+ field type information is already
+ copied there, or will be copied
+ later */
+ byte* buf, /* in/out: buffer to use in converting
+ data in columns; this must be at least
+ the size of mysql_rec! */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct where template
+ must be of type ROW_MYSQL_WHOLE_ROW */
+ byte* mysql_rec) /* in: row in the MySQL format;
+ NOTE: do not discard as long as
+ row is used, as row may contain
+ pointers to this record! */
+{
+ mysql_row_templ_t* templ;
+ dfield_t* dfield;
+ ulint i;
+
+ ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+ ut_ad(prebuilt->mysql_template);
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+ dfield = dtuple_get_nth_field(row, i);
+
+ if (templ->mysql_null_bit_mask != 0) {
+ /* Column may be SQL NULL */
+
+ if (mysql_rec[templ->mysql_null_byte_offset] &
+ (byte) (templ->mysql_null_bit_mask)) {
+
+ /* It is SQL NULL */
+
+ dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
+
+ goto next_column;
+ }
+ }
+
+ row_mysql_store_col_in_innobase_format(dfield,
+ prebuilt->ins_upd_rec_buff
+ + templ->mysql_col_offset,
+ mysql_rec + templ->mysql_col_offset,
+ templ->mysql_col_len,
+ templ->type, templ->is_unsigned);
+next_column:
+ ;
+ }
+}
+
+/********************************************************************
+Handles user errors and lock waits detected by the database engine. */
+
+ibool
+row_mysql_handle_errors(
+/*====================*/
+ /* out: TRUE if it was a lock wait and
+ we should continue running the query thread */
+ ulint* new_err,/* out: possible new error encountered in
+ rollback, or the old error which was
+ during the function entry */
+ trx_t* trx, /* in: transaction */
+ que_thr_t* thr, /* in: query thread */
+ trx_savept_t* savept) /* in: savepoint */
+{
+ ibool timeout_expired;
+ ulint err;
+
+handle_new_error:
+ err = trx->error_state;
+
+ ut_a(err != DB_SUCCESS);
+
+ trx->error_state = DB_SUCCESS;
+
+ if (err == DB_DUPLICATE_KEY) {
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_general_rollback_for_mysql(trx, TRUE, savept);
+ }
+ } else if (err == DB_TOO_BIG_RECORD) {
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_general_rollback_for_mysql(trx, TRUE, savept);
+ }
+ } else if (err == DB_LOCK_WAIT) {
+
+ timeout_expired = srv_suspend_mysql_thread(thr);
+
+ if (timeout_expired) {
+ trx->error_state = DB_DEADLOCK;
+
+ que_thr_stop_for_mysql(thr);
+
+ goto handle_new_error;
+ }
+
+ *new_err = err;
+
+ return(TRUE);
+
+ } else if (err == DB_DEADLOCK) {
+
+ /* Roll back the whole transaction */
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+
+ /* Roll back the whole transaction */
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+ } else if (err == DB_MUST_GET_MORE_FILE_SPACE) {
+
+ ut_a(0); /* TODO: print something to MySQL error log */
+ } else {
+ ut_a(0);
+ }
+
+ if (trx->error_state != DB_SUCCESS) {
+ *new_err = trx->error_state;
+ } else {
+ *new_err = err;
+ }
+
+ trx->error_state = DB_SUCCESS;
+
+ return(FALSE);
+}
+
+/************************************************************************
+Create a prebuilt struct for a MySQL table handle. */
+
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ /* out, own: a prebuilt struct */
+ dict_table_t* table) /* in: Innobase table handle */
+{
+ row_prebuilt_t* prebuilt;
+ mem_heap_t* heap;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ ulint ref_len;
+ ulint i;
+
+ heap = mem_heap_create(128);
+
+ prebuilt = mem_heap_alloc(heap, sizeof(row_prebuilt_t));
+
+ prebuilt->table = table;
+
+ prebuilt->trx = NULL;
+
+ prebuilt->sql_stat_start = TRUE;
+
+ prebuilt->index = NULL;
+ prebuilt->n_template = 0;
+ prebuilt->mysql_template = NULL;
+
+ prebuilt->heap = heap;
+ prebuilt->ins_node = NULL;
+
+ prebuilt->ins_upd_rec_buff = NULL;
+
+ prebuilt->upd_node = NULL;
+ prebuilt->ins_graph = NULL;
+ prebuilt->upd_graph = NULL;
+
+ prebuilt->pcur = btr_pcur_create_for_mysql();
+ prebuilt->clust_pcur = btr_pcur_create_for_mysql();
+
+ prebuilt->select_lock_type = LOCK_NONE;
+
+ prebuilt->sel_graph = NULL;
+
+ prebuilt->search_tuple = dtuple_create(heap,
+ dict_table_get_n_cols(table));
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ prebuilt->clust_ref = ref;
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ prebuilt->fetch_cache[i] = NULL;
+ }
+
+ prebuilt->n_fetch_cached = 0;
+
+ prebuilt->blob_heap = NULL;
+
+ prebuilt->old_vers_heap = NULL;
+
+ return(prebuilt);
+}
+
+/************************************************************************
+Free a prebuilt struct for a MySQL table handle. */
+
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt) /* in, own: prebuilt struct */
+{
+ ulint i;
+
+ btr_pcur_free_for_mysql(prebuilt->pcur);
+ btr_pcur_free_for_mysql(prebuilt->clust_pcur);
+
+ if (prebuilt->mysql_template) {
+ mem_free(prebuilt->mysql_template);
+ }
+
+ if (prebuilt->ins_graph) {
+ que_graph_free_recursive(prebuilt->ins_graph);
+ }
+
+ if (prebuilt->sel_graph) {
+ que_graph_free_recursive(prebuilt->sel_graph);
+ }
+
+ if (prebuilt->upd_graph) {
+ que_graph_free_recursive(prebuilt->upd_graph);
+ }
+
+ if (prebuilt->blob_heap) {
+ mem_heap_free(prebuilt->blob_heap);
+ }
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_free(prebuilt->old_vers_heap);
+ }
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ if (prebuilt->fetch_cache[i] != NULL) {
+ mem_free(prebuilt->fetch_cache[i]);
+ }
+ }
+
+ mem_heap_free(prebuilt->heap);
+}
+
+/*************************************************************************
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL
+ handle */
+ trx_t* trx) /* in: transaction handle */
+{
+ prebuilt->trx = trx;
+
+ if (prebuilt->ins_graph) {
+ prebuilt->ins_graph->trx = trx;
+ }
+
+ if (prebuilt->upd_graph) {
+ prebuilt->upd_graph->trx = trx;
+ }
+
+ if (prebuilt->sel_graph) {
+ prebuilt->sel_graph->trx = trx;
+ }
+}
+
+/*************************************************************************
+Gets pointer to a prebuilt dtuple used in insertions. If the insert graph
+has not yet been built in the prebuilt struct, then this function first
+builds it. */
+static
+dtuple_t*
+row_get_prebuilt_insert_row(
+/*========================*/
+ /* out: prebuilt dtuple */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ ins_node_t* node;
+ dtuple_t* row;
+ dict_table_t* table = prebuilt->table;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->ins_node == NULL) {
+
+		/* Not called before for this handle: create an insert node
+		and a query graph in the prebuilt struct */
+
+ node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+
+ prebuilt->ins_node = node;
+
+ if (prebuilt->ins_upd_rec_buff == NULL) {
+ prebuilt->ins_upd_rec_buff = mem_heap_alloc(
+ prebuilt->heap,
+ prebuilt->mysql_row_len);
+ }
+
+ row = dtuple_create(prebuilt->heap,
+ dict_table_get_n_cols(table));
+
+ dict_table_copy_types(row, table);
+
+ ins_node_set_new_row(node, row);
+
+ prebuilt->ins_graph =
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+ prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->ins_node->row);
+}
+
+/*************************************************************************
+Updates the table modification counter and calculates new estimates
+for table and index statistics if necessary. */
+UNIV_INLINE
+void
+row_update_statistics_if_needed(
+/*============================*/
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct */
+{
+ ulint counter;
+ ulint old_counter;
+
+ counter = prebuilt->table->stat_modif_counter;
+
+ counter += prebuilt->mysql_row_len;
+ prebuilt->table->stat_modif_counter = counter;
+
+ old_counter = prebuilt->table->stat_last_estimate_counter;
+
+ if (counter - old_counter >= DICT_STAT_CALCULATE_INTERVAL
+ || counter - old_counter >=
+ (UNIV_PAGE_SIZE
+ * prebuilt->table->stat_clustered_index_size / 2)) {
+
+ dict_update_statistics(prebuilt->table);
+ }
+}
+
+/*************************************************************************
+Does an insert for MySQL. */
+
+int
+row_insert_for_mysql(
+/*=================*/
+ /* out: error code or DB_SUCCESS */
+ byte* mysql_rec, /* in: row in the MySQL format */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (node == NULL) {
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+ }
+
+ row_mysql_convert_row_to_innobase(node->row,
+ prebuilt->ins_upd_rec_buff,
+ prebuilt, mysql_rec);
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ if (prebuilt->sql_stat_start) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ node->state = INS_NODE_ALLOC_ROW_ID;
+ }
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_ins_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ return(err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ prebuilt->table->stat_n_rows++;
+
+ if (prebuilt->table->stat_n_rows == 0) {
+		/* Avoid wrap-around */
+ prebuilt->table->stat_n_rows--;
+ }
+
+ row_update_statistics_if_needed(prebuilt);
+
+ return((int) err);
+}
+
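The function above shows the standard MySQL-interface execution loop: run the node, and on error either retry (the error was a lock wait that has since been resolved) or return it. A runnable, condensed sketch of that control flow; every name here is a hypothetical stand-in, not the Innobase API:

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { DB_OK, DB_LOCK_WAIT_ERR, DB_DUP_KEY_ERR } db_err_t;

    static int attempts;       /* demo state: fail once with a lock wait */

    static db_err_t step(void) /* stands in for row_ins_step() */
    {
            return (attempts++ == 0) ? DB_LOCK_WAIT_ERR : DB_OK;
    }

    /* Returns true when the error was a lock wait that has since been
       resolved, so the step should simply be run again; the real code
       suspends the thread here and may turn a timeout into a deadlock
       error (see row_mysql_handle_errors()). */
    static bool handle_errors(db_err_t *err)
    {
            return (*err == DB_LOCK_WAIT_ERR);
    }

    int main(void)
    {
            db_err_t err;

            do {               /* the "run_again:" loop, without goto */
                    err = step();
            } while (err != DB_OK && handle_errors(&err));

            printf("final err = %d after %d attempt(s)\n", err, attempts);
            return 0;
    }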
+/*************************************************************************
+Builds a dummy query graph used in selects. */
+
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ sel_node_t* node;
+
+ ut_ad(prebuilt && prebuilt->trx);
+
+ if (prebuilt->sel_graph == NULL) {
+
+ node = sel_node_create(prebuilt->heap);
+
+ prebuilt->sel_graph =
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+
+ prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+ }
+}
+
+/*************************************************************************
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it. */
+
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ /* out: prebuilt update vector */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+ upd_node_t* node;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->upd_node == NULL) {
+
+		/* Not called before for this handle: create an update node
+		and a query graph in the prebuilt struct */
+
+ node = upd_node_create(prebuilt->heap);
+
+ prebuilt->upd_node = node;
+
+ node->in_mysql_interface = TRUE;
+ node->is_delete = FALSE;
+ node->searched_update = FALSE;
+ node->select_will_do_update = FALSE;
+ node->select = NULL;
+ node->pcur = btr_pcur_create_for_mysql();
+ node->table = table;
+
+ node->update = upd_create(dict_table_get_n_cols(table),
+ prebuilt->heap);
+ UT_LIST_INIT(node->columns);
+ node->has_clust_rec_x_lock = TRUE;
+ node->cmpl_info = 0;
+
+ node->table_sym = NULL;
+ node->col_assign_list = NULL;
+
+ prebuilt->upd_graph =
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+ prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->upd_node->update);
+}
+
+/*************************************************************************
+Does an update or delete of a row for MySQL. */
+
+int
+row_update_for_mysql(
+/*=================*/
+ /* out: error code or DB_SUCCESS */
+ byte* mysql_rec, /* in: the row to be updated, in
+ the MySQL format */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ ulint err;
+ que_thr_t* thr;
+ ibool was_lock_wait;
+ dict_index_t* clust_index;
+ ulint ref_len;
+ upd_node_t* node;
+ dict_table_t* table = prebuilt->table;
+ trx_t* trx = prebuilt->trx;
+ mem_heap_t* heap;
+ dtuple_t* search_tuple;
+ dtuple_t* row_tuple;
+ mtr_t mtr;
+
+ ut_ad(prebuilt && trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ node = prebuilt->upd_node;
+
+ clust_index = dict_table_get_first_index(table);
+
+ if (prebuilt->in_update_remember_pos) {
+ if (prebuilt->index == clust_index) {
+ btr_pcur_copy_stored_position(node->pcur,
+ prebuilt->pcur);
+ } else {
+ btr_pcur_copy_stored_position(node->pcur,
+ prebuilt->clust_pcur);
+ }
+
+ ut_ad(node->pcur->rel_pos == BTR_PCUR_ON);
+
+ goto skip_cursor_search;
+ }
+
+ /* We have to search for the correct cursor position */
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ heap = mem_heap_create(450);
+
+ row_tuple = dtuple_create(heap, dict_table_get_n_cols(table));
+ dict_table_copy_types(row_tuple, table);
+
+ if (prebuilt->ins_upd_rec_buff == NULL) {
+ prebuilt->ins_upd_rec_buff = mem_heap_alloc(prebuilt->heap,
+ prebuilt->mysql_row_len);
+ }
+
+ row_mysql_convert_row_to_innobase(row_tuple,
+ prebuilt->ins_upd_rec_buff,
+ prebuilt, mysql_rec);
+
+ search_tuple = dtuple_create(heap, ref_len);
+
+ row_build_row_ref_from_row(search_tuple, table, row_tuple);
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_with_no_init(clust_index, search_tuple, PAGE_CUR_LE,
+ BTR_SEARCH_LEAF, node->pcur, 0, &mtr);
+
+ btr_pcur_store_position(node->pcur, &mtr);
+
+ mtr_commit(&mtr);
+
+ mem_heap_free(heap);
+
+skip_cursor_search:
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->upd_graph);
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ ut_ad(!prebuilt->sql_stat_start);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ if (err == DB_RECORD_NOT_FOUND) {
+ trx->error_state = DB_SUCCESS;
+
+ return((int) err);
+ }
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ return(err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ if (prebuilt->upd_node->is_delete) {
+ if (prebuilt->table->stat_n_rows > 0) {
+ prebuilt->table->stat_n_rows--;
+ }
+ }
+
+ row_update_statistics_if_needed(prebuilt);
+
+ return((int) err);
+}
+
+/*************************************************************************
+Checks if a table is such that we automatically created a clustered
+index on it (on row id). */
+
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+ dict_table_t* table)
+{
+ dict_index_t* clust_index;
+
+ clust_index = dict_table_get_first_index(table);
+
+ if (dtype_get_mtype(dict_index_get_nth_type(clust_index, 0))
+ == DATA_SYS) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************************
+Does a table creation operation for MySQL. */
+
+int
+row_create_table_for_mysql(
+/*=======================*/
+ /* out: error code or DB_SUCCESS */
+ dict_table_t* table, /* in: table definition */
+ trx_t* trx) /* in: transaction handle */
+{
+ tab_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ ulint err;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	/* Serialize data dictionary operations with the dictionary mutex:
+	then no deadlocks can occur in these operations */
+
+ mutex_enter(&(dict_sys->mutex));
+
+ heap = mem_heap_create(512);
+
+ trx->dict_operation = TRUE;
+
+ node = tab_create_graph_create(table, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr),
+ SESS_COMM_EXECUTE, 0));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+ ut_a(err == DB_OUT_OF_FILE_SPACE);
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ row_drop_table_for_mysql(table->name, trx, TRUE);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ return((int) err);
+}
+
+/*************************************************************************
+Does an index creation operation for MySQL. TODO: currently a failure
+to create an index results in dropping the whole table! This is not a
+problem as long as all indexes must be created at the same time as the
+table. */
+
+int
+row_create_index_for_mysql(
+/*=======================*/
+ /* out: error number or DB_SUCCESS */
+	dict_index_t*	index,	/* in: index definition */
+ trx_t* trx) /* in: transaction handle */
+{
+ ind_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ ulint err;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	/* Serialize data dictionary operations with the dictionary mutex:
+	then no deadlocks can occur in these operations */
+
+ mutex_enter(&(dict_sys->mutex));
+
+ heap = mem_heap_create(512);
+
+ trx->dict_operation = TRUE;
+
+ node = ind_create_graph_create(index, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr),
+ SESS_COMM_EXECUTE, 0));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+ ut_a(err == DB_OUT_OF_FILE_SPACE);
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, FALSE, NULL);
+
+ row_drop_table_for_mysql(index->table_name, trx, TRUE);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ return((int) err);
+}
+
+/*************************************************************************
+Drops a table for MySQL. */
+
+int
+row_drop_table_for_mysql(
+/*=====================*/
+ /* out: error code or DB_SUCCESS */
+ char* name, /* in: table name */
+ trx_t* trx, /* in: transaction handle */
+ ibool has_dict_mutex) /* in: TRUE if the caller already owns the
+ dictionary system mutex */
+{
+ dict_table_t* table;
+ que_thr_t* thr;
+ que_t* graph;
+ ulint err;
+ char* str1;
+ char* str2;
+ ulint len;
+ char buf[10000];
+retry:
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_a(name != NULL);
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in deleting the dictionary data from system
+ tables in Innobase. Deleting a row from SYS_INDEXES table also
+ frees the file segments of the B-tree associated with the index. */
+
+ str1 =
+ "PROCEDURE DROP_TABLE_PROC () IS\n"
+ "table_id CHAR;\n"
+ "index_id CHAR;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "SELECT ID INTO table_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME ='";
+
+ str2 =
+ "';\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " COMMIT WORK;\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "found := 1;\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO index_id\n"
+ " FROM SYS_INDEXES\n"
+ " WHERE TABLE_ID = table_id;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID = index_id;\n"
+ " DELETE FROM SYS_INDEXES WHERE ID = index_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "DELETE FROM SYS_COLUMNS WHERE TABLE_ID = table_id;\n"
+ "DELETE FROM SYS_TABLES WHERE ID = table_id;\n"
+ "COMMIT WORK;\n"
+ "END;\n";
+
+ len = ut_strlen(str1);
+
+ ut_memcpy(buf, str1, len);
+ ut_memcpy(buf + len, name, ut_strlen(name));
+
+ len += ut_strlen(name);
+
+ ut_memcpy(buf + len, str2, ut_strlen(str2) + 1);
+
+	/* Serialize data dictionary operations with the dictionary mutex:
+	then no deadlocks can occur in these operations */
+
+ if (!has_dict_mutex) {
+ mutex_enter(&(dict_sys->mutex));
+ }
+
+ graph = pars_sql(buf);
+
+ ut_a(graph);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ /* Prevent purge from running while we are dropping the table */
+ rw_lock_s_lock(&(purge_sys->purge_is_running));
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ /* Check if there are any locks on the table: if yes, it cannot
+ be dropped: we have to wait for the locks to be released */
+
+ if (lock_is_on_table(table)) {
+
+ err = DB_TABLE_IS_BEING_USED;
+
+ goto funct_exit;
+ }
+
+	/* TODO: check that MySQL prevents users from accessing the table
+	after this function row_drop_table_for_mysql has been called:
+	otherwise anyone with an open handle to the table could, for
+	example, still read the table! */
+
+ trx->dict_operation = TRUE;
+ trx->table_id = table->id;
+
+ ut_a(thr = que_fork_start_command(graph, SESS_COMM_EXECUTE, 0));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ ut_a(err == DB_OUT_OF_FILE_SPACE);
+
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+
+ row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ ut_a(0);
+ } else {
+ dict_table_remove_from_cache(table);
+ }
+funct_exit:
+ rw_lock_s_unlock(&(purge_sys->purge_is_running));
+
+ if (!has_dict_mutex) {
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ que_graph_free(graph);
+
+ if (err == DB_TABLE_IS_BEING_USED) {
+ os_thread_sleep(200000);
+
+ goto retry;
+ }
+
+ return((int) err);
+}
+
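The procedure text above is spliced together by hand with ut_strlen/ut_memcpy into a fixed 10000-byte buffer. For reference, the same splice in portable C with an explicit bounds check; this is an editor's illustration using standard snprintf, not an Innobase utility:

    #include <stdio.h>

    /* Splice a table name between the head and tail of a SQL procedure
       template, as row_drop_table_for_mysql() does by hand.  Returns 0
       on success, -1 if the result would not fit in buf. */
    static int
    build_proc_sql(char *buf, size_t buf_len, const char *head,
                   const char *name, const char *tail)
    {
            int n = snprintf(buf, buf_len, "%s%s%s", head, name, tail);

            return (n >= 0 && (size_t) n < buf_len) ? 0 : -1;
    }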
+/*************************************************************************
+Renames a table for MySQL. */
+
+int
+row_rename_table_for_mysql(
+/*=======================*/
+ /* out: error code or DB_SUCCESS */
+ char* old_name, /* in: old table name */
+ char* new_name, /* in: new table name */
+ trx_t* trx) /* in: transaction handle */
+{
+ dict_table_t* table;
+ que_thr_t* thr;
+ que_t* graph;
+ ulint err;
+ char* str1;
+ char* str2;
+ char* str3;
+ ulint len;
+ char buf[10000];
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_a(old_name != NULL);
+ ut_a(new_name != NULL);
+
+ str1 =
+ "PROCEDURE RENAME_TABLE_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET NAME ='";
+
+ str2 =
+ "' WHERE NAME = '";
+
+ str3 =
+ "';\n"
+ "COMMIT WORK;\n"
+ "END;\n";
+
+ len = ut_strlen(str1);
+
+ ut_memcpy(buf, str1, len);
+
+ ut_memcpy(buf + len, new_name, ut_strlen(new_name));
+
+ len += ut_strlen(new_name);
+
+ ut_memcpy(buf + len, str2, ut_strlen(str2));
+
+ len += ut_strlen(str2);
+
+ ut_memcpy(buf + len, old_name, ut_strlen(old_name));
+
+ len += ut_strlen(old_name);
+
+ ut_memcpy(buf + len, str3, ut_strlen(str3) + 1);
+
+	/* Serialize data dictionary operations with the dictionary mutex:
+	then no deadlocks can occur in these operations */
+
+ mutex_enter(&(dict_sys->mutex));
+
+ table = dict_table_get_low(old_name);
+
+ graph = pars_sql(buf);
+
+ ut_a(graph);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ ut_a(thr = que_fork_start_command(graph, SESS_COMM_EXECUTE, 0));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ row_mysql_handle_errors(&err, trx, thr, NULL);
+ } else {
+ ut_a(dict_table_rename_in_cache(table, new_name));
+ }
+funct_exit:
+ mutex_exit(&(dict_sys->mutex));
+
+ que_graph_free(graph);
+
+ return((int) err);
+}
diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c
new file mode 100644
index 00000000000..0a6fabe584c
--- /dev/null
+++ b/innobase/row/row0purge.c
@@ -0,0 +1,553 @@
+/******************************************************
+Purge obsolete records
+
+(c) 1997 Innobase Oy
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0purge.h"
+
+#ifdef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "log0log.h"
+
+/************************************************************************
+Creates a purge node for a query graph. */
+
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+ /* out, own: purge node */
+ que_thr_t* parent, /* in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /* in: memory heap where created */
+{
+ purge_node_t* node;
+
+ ut_ad(parent && heap);
+
+ node = mem_heap_alloc(heap, sizeof(purge_node_t));
+
+ node->common.type = QUE_NODE_PURGE;
+ node->common.parent = parent;
+
+ node->heap = mem_heap_create(256);
+
+ return(node);
+}
+
+/***************************************************************
+Repositions the pcur in the purge node on the clustered index record,
+if found. */
+static
+ibool
+row_purge_reposition_pcur(
+/*======================*/
+ /* out: TRUE if the record was found */
+ ulint mode, /* in: latching mode */
+ purge_node_t* node, /* in: row purge node */
+ mtr_t* mtr) /* in: mtr */
+{
+ ibool found;
+
+ if (node->found_clust) {
+ found = btr_pcur_restore_position(mode, &(node->pcur), mtr);
+
+ return(found);
+ }
+
+ found = row_search_on_row_ref(&(node->pcur), mode, node->table,
+ node->ref, mtr);
+ node->found_clust = found;
+
+ if (found) {
+ btr_pcur_store_position(&(node->pcur), mtr);
+ }
+
+ return(found);
+}
+
+/***************************************************************
+Removes a delete marked clustered index record if possible. */
+static
+ibool
+row_purge_remove_clust_if_poss_low(
+/*===============================*/
+ /* out: TRUE if success, or if not found, or
+ if modified after the delete marking */
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr, /* in: query thread */
+ ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ulint err;
+ mtr_t mtr;
+
+ UT_NOT_USED(thr);
+
+ index = dict_table_get_first_index(node->table);
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ mtr_start(&mtr);
+
+ success = row_purge_reposition_pcur(mode, node, &mtr);
+
+ if (!success) {
+ /* The record is already removed */
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(TRUE);
+ }
+
+ if (0 != ut_dulint_cmp(node->roll_ptr,
+ row_get_rec_roll_ptr(btr_pcur_get_rec(pcur), index))) {
+
+ /* Someone else has modified the record later: do not remove */
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(TRUE);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr);
+
+ if (err == DB_SUCCESS) {
+ success = TRUE;
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+ success = FALSE;
+ } else {
+ ut_a(0);
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(success);
+}
+
+/***************************************************************
+Removes a clustered index record if it has not been modified after the delete
+marking. */
+static
+void
+row_purge_remove_clust_if_poss(
+/*===========================*/
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+/* printf("Purge: Removing clustered record\n"); */
+
+ success = row_purge_remove_clust_if_poss_low(node, thr,
+ BTR_MODIFY_LEAF);
+ if (success) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_clust_if_poss_low(node, thr,
+ BTR_MODIFY_TREE);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
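+
+/* Note on the pattern above: btr_cur_optimistic_delete with
+BTR_MODIFY_LEAF is tried first because it needs to x-latch only the
+leaf page; it fails if removing the record would require restructuring
+the tree, for example when the page would become too empty. Only then
+do we fall back to the pessimistic BTR_MODIFY_TREE variant, which may
+modify the whole tree path and can thus run out of file space, hence
+the retry loop above. */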
+
+/***************************************************************
+Removes a secondary index entry if possible. */
+static
+ibool
+row_purge_remove_sec_if_poss_low(
+/*=============================*/
+ /* out: TRUE if success or if not found */
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry */
+ ulint mode) /* in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has;
+ ibool found;
+ ulint err;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+
+ UT_NOT_USED(thr);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ if (!found) {
+ /* Not found */
+
+ /* FIXME: printf("PURGE:........sec entry not found\n"); */
+ /* dtuple_print(entry); */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	/* We should remove the index record if no later version of the row,
+	which cannot be purged yet, requires its existence. If some later
+	version requires it, we should do nothing. */
+
+ mtr_start(&mtr_vers);
+
+ success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr_vers);
+
+ if (success) {
+ old_has = row_vers_old_has_index_entry(TRUE,
+ btr_pcur_get_rec(&(node->pcur)),
+ &mtr_vers, index, entry);
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+
+ if (!success || !old_has) {
+ /* Remove the index record */
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr);
+
+ if (err == DB_SUCCESS) {
+ success = TRUE;
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+ success = FALSE;
+ } else {
+ ut_a(0);
+ }
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(success);
+}
+
+/***************************************************************
+Removes a secondary index entry if possible. */
+UNIV_INLINE
+void
+row_purge_remove_sec_if_poss(
+/*=========================*/
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+/* printf("Purge: Removing secondary record\n"); */
+
+ success = row_purge_remove_sec_if_poss_low(node, thr, index, entry,
+ BTR_MODIFY_LEAF);
+ if (success) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_sec_if_poss_low(node, thr, index, entry,
+ BTR_MODIFY_TREE);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
+
+/***************************************************************
+Purges a delete marking of a record. */
+static
+void
+row_purge_del_mark(
+/*===============*/
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+
+ ut_ad(node && thr);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ /* Build the index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_purge_remove_sec_if_poss(node, thr, index, entry);
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ row_purge_remove_clust_if_poss(node, thr);
+}
+
+/***************************************************************
+Purges an update of an existing record. */
+static
+void
+row_purge_upd_exist(
+/*================*/
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+
+ ut_ad(node && thr);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ if (row_upd_changes_ord_field(NULL, node->index,
+ node->update)) {
+ /* Build the older version of the index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_purge_remove_sec_if_poss(node, thr, index, entry);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+}
+
+/***************************************************************
+Parses the row reference and other info in a modify undo log record. */
+static
+ibool
+row_purge_parse_undo_rec(
+/*=====================*/
+ /* out: TRUE if purge operation required */
+ purge_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ dulint undo_no;
+ dulint table_id;
+ dulint trx_id;
+ dulint roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+
+ ut_ad(node && thr);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ &undo_no, &table_id);
+ node->rec_type = type;
+
+ if (type == TRX_UNDO_UPD_DEL_REC) {
+
+ return(FALSE);
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+ node->table = NULL;
+
+ if (type == TRX_UNDO_UPD_EXIST_REC
+ && cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+ /* Purge requires no changes to indexes: we may return */
+
+ return(FALSE);
+ }
+
+ /* NOTE that the table has to be explicitly released later */
+
+ /* TODO: currently nothing prevents dropping of table when purge
+ is accessing it! */
+
+ mutex_enter(&(dict_sys->mutex));
+
+ node->table = dict_table_get_on_id_low(table_id, thr_get_trx(thr));
+
+ rw_lock_x_lock(&(purge_sys->purge_is_running));
+
+ mutex_exit(&(dict_sys->mutex));
+
+ if (node->table == NULL) {
+ /* The table has been dropped: no need to do purge */
+
+ rw_lock_x_unlock(&(purge_sys->purge_is_running));
+
+ return(FALSE);
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, node->heap,
+ &(node->update));
+
+	/* Read into the partial row the fields that occur in indexes */
+
+ ptr = trx_undo_rec_get_partial_row(ptr, clust_index, &(node->row),
+ node->heap);
+ return(TRUE);
+}
+
+/***************************************************************
+Fetches an undo log record and does the purge for the recorded operation.
+If none is left, or the current purge has completed, returns control to the
+parent node, which is always a query thread node. */
+static
+ulint
+row_purge(
+/*======*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code */
+ purge_node_t* node, /* in: row purge node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dulint roll_ptr;
+ ibool purge_needed;
+
+ ut_ad(node && thr);
+
+ node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr,
+ &(node->reservation),
+ node->heap);
+ if (!node->undo_rec) {
+ /* Purge completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+
+ if (node->undo_rec == &trx_purge_dummy_rec) {
+ purge_needed = FALSE;
+ } else {
+ purge_needed = row_purge_parse_undo_rec(node, thr);
+ }
+
+ if (purge_needed) {
+ node->found_clust = FALSE;
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+ row_purge_upd_exist(node, thr);
+ } else {
+ ut_ad(node->rec_type == TRX_UNDO_DEL_MARK_REC);
+ row_purge_del_mark(node, thr);
+ }
+
+ if (node->found_clust) {
+ btr_pcur_close(&(node->pcur));
+ }
+
+ rw_lock_x_unlock(&(purge_sys->purge_is_running));
+ }
+
+ /* Do some cleanup */
+ trx_purge_rec_release(node->reservation);
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph. */
+
+que_thr_t*
+row_purge_step(
+/*===========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ purge_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ err = row_purge(node, thr);
+
+ ut_ad(err == DB_SUCCESS);
+
+ return(thr);
+}
diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c
new file mode 100644
index 00000000000..f85789fa0d6
--- /dev/null
+++ b/innobase/row/row0row.c
@@ -0,0 +1,652 @@
+/******************************************************
+General row routines
+
+(c) 1996 Innobase Oy
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+
+#ifdef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#include "dict0dict.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+
+/*************************************************************************
+Reads the trx id or roll ptr field from a clustered index record: this function
+is slower than the specialized inline functions. */
+
+dulint
+row_get_rec_sys_field(
+/*==================*/
+ /* out: value of the field */
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ rec_t* rec, /* in: record */
+ dict_index_t* index) /* in: clustered index */
+{
+ ulint pos;
+ byte* field;
+ ulint len;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ field = rec_get_nth_field(rec, pos, &len);
+
+ if (type == DATA_TRX_ID) {
+
+ return(trx_read_trx_id(field));
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+
+ return(trx_read_roll_ptr(field));
+ }
+}
+
+/*************************************************************************
+Sets the trx id or roll ptr field in a clustered index record: this function
+is slower than the specialized inline functions. */
+
+void
+row_set_rec_sys_field(
+/*==================*/
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: clustered index */
+ dulint val) /* in: value to set */
+{
+ ulint pos;
+ byte* field;
+ ulint len;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ field = rec_get_nth_field(rec, pos, &len);
+
+ if (type == DATA_TRX_ID) {
+
+ trx_write_trx_id(field, val);
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+
+ trx_write_roll_ptr(field, val);
+ }
+}
+
+/*********************************************************************
+When an insert to a table is performed, this function builds the entry which
+has to be inserted to an index on the table. */
+
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ /* out: index entry which should be inserted */
+ dtuple_t* row, /* in: row which should be inserted to the
+ table */
+ dict_index_t* index, /* in: index on the table */
+ mem_heap_t* heap) /* in: memory heap from which the memory for
+ the index entry is allocated */
+{
+ dtuple_t* entry;
+ ulint entry_len;
+ dict_field_t* ind_field;
+ dfield_t* dfield;
+ dfield_t* dfield2;
+ dict_col_t* col;
+ ulint i;
+
+ ut_ad(row && index && heap);
+ ut_ad(dtuple_check_typed(row));
+
+ entry_len = dict_index_get_n_fields(index);
+ entry = dtuple_create(heap, entry_len);
+
+ if (index->type & DICT_UNIVERSAL) {
+ dtuple_set_n_fields_cmp(entry, entry_len);
+ } else {
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ }
+
+ for (i = 0; i < entry_len; i++) {
+ ind_field = dict_index_get_nth_field(index, i);
+ col = ind_field->col;
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_copy(dfield, dfield2);
+ dfield->col_no = dict_col_get_no(col);
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+
+ return(entry);
+}
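+
+/* A minimal usage sketch (illustrative only; the actual insert step is
+elided): when a row is inserted, an entry is built in turn for each
+index of the table, roughly as in
+
+	index = dict_table_get_first_index(table);
+
+	while (index != NULL) {
+		entry = row_build_index_entry(row, index, heap);
+
+		(... insert entry into index ...)
+
+		index = dict_table_get_next_index(index);
+	}
+*/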
+
+/***********************************************************************
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index. */
+
+dtuple_t*
+row_build(
+/*======*/
+ /* out, own: row built; see the NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap as the latter only places pointers to
+ data fields on the index page, and thus is
+ more efficient */
+ dict_index_t* index, /* in: clustered index */
+ rec_t* rec, /* in: record in the clustered index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row dtuple is used! */
+ mem_heap_t* heap) /* in: memory heap from which the memory
+ needed is allocated */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ ulint n_fields;
+ ulint i;
+ dfield_t* dfield;
+ byte* field;
+ ulint len;
+ ulint row_len;
+ dict_col_t* col;
+ byte* buf;
+
+ ut_ad(index && rec && heap);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+ buf = mem_heap_alloc(heap, rec_get_size(rec));
+ rec = rec_copy(buf, rec);
+ }
+
+ table = index->table;
+ row_len = dict_table_get_n_cols(table);
+
+ row = dtuple_create(heap, row_len);
+
+ dtuple_set_info_bits(row, rec_get_info_bits(rec));
+
+ n_fields = dict_index_get_n_fields(index);
+
+ ut_ad(n_fields == rec_get_n_fields(rec));
+
+ dict_table_copy_types(row, table);
+
+ for (i = 0; i < n_fields; i++) {
+
+ col = dict_field_get_col(dict_index_get_nth_field(index, i));
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ field = rec_get_nth_field(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ ut_ad(dtuple_check_typed(row));
+
+ return(row);
+}
+
+/***********************************************************************
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index, into a tuple provided by the caller. */
+
+void
+row_build_to_tuple(
+/*===============*/
+ dtuple_t* row, /* in/out: row built; see the NOTE below! */
+ dict_index_t* index, /* in: clustered index */
+ rec_t* rec) /* in: record in the clustered index;
+ NOTE: the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row dtuple is used! */
+{
+ dict_table_t* table;
+ ulint n_fields;
+ ulint i;
+ dfield_t* dfield;
+ byte* field;
+ ulint len;
+ ulint row_len;
+ dict_col_t* col;
+
+ ut_ad(index && rec);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ table = index->table;
+ row_len = dict_table_get_n_cols(table);
+
+ dtuple_set_info_bits(row, rec_get_info_bits(rec));
+
+ n_fields = dict_index_get_n_fields(index);
+
+ ut_ad(n_fields == rec_get_n_fields(rec));
+
+ dict_table_copy_types(row, table);
+
+ for (i = 0; i < n_fields; i++) {
+
+ col = dict_field_get_col(dict_index_get_nth_field(index, i));
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ field = rec_get_nth_field(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ ut_ad(dtuple_check_typed(row));
+}
+
+/***********************************************************************
+Converts an index record to a typed data tuple. */
+
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ /* out, own: index entry built; see the
+ NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap as the latter only places pointers to
+ data fields on the index page */
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the dtuple is used! */
+ mem_heap_t* heap) /* in: memory heap from which the memory
+ needed is allocated */
+{
+ dtuple_t* entry;
+ dfield_t* dfield;
+ ulint i;
+ byte* field;
+ ulint len;
+ ulint rec_len;
+ byte* buf;
+
+ ut_ad(rec && heap && index);
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+ buf = mem_heap_alloc(heap, rec_get_size(rec));
+ rec = rec_copy(buf, rec);
+ }
+
+ rec_len = rec_get_n_fields(rec);
+
+ entry = dtuple_create(heap, rec_len);
+
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ ut_ad(rec_len == dict_index_get_n_fields(index));
+
+ dict_index_copy_types(entry, index, rec_len);
+
+ dtuple_set_info_bits(entry, rec_get_info_bits(rec));
+
+ for (i = 0; i < rec_len; i++) {
+
+ dfield = dtuple_get_nth_field(entry, i);
+ field = rec_get_nth_field(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+
+ return(entry);
+}
+
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ /* out, own: row reference built; see the
+ NOTE below! */
+ ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /* in: index */
+ rec_t* rec, /* in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /* in: memory heap from which the memory
+ needed is allocated */
+{
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dict_col_t* col;
+ dtuple_t* ref;
+ byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ byte* buf;
+ ulint i;
+
+ ut_ad(index && rec && heap);
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+
+ buf = mem_heap_alloc(heap, rec_get_size(rec));
+
+ rec = rec_copy(buf, rec);
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ col = dict_field_get_col(
+ dict_index_get_nth_field(clust_index, i));
+ pos = dict_index_get_nth_col_pos(index, dict_col_get_no(col));
+
+ if (pos != ULINT_UNDEFINED) {
+ field = rec_get_nth_field(rec, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+ } else {
+ ut_ad(table->type == DICT_TABLE_CLUSTER_MEMBER);
+ ut_ad(i == table->mix_len);
+
+ dfield_set_data(dfield,
+ mem_heap_alloc(heap, table->mix_id_len),
+ table->mix_id_len);
+ ut_memcpy(dfield_get_data(dfield), table->mix_id_buf,
+ table->mix_id_len);
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+
+ return(ref);
+}
+
+/***********************************************************************
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /* in/out: row reference built; see the
+ NOTE below! */
+ dict_index_t* index, /* in: index */
+ rec_t* rec) /* in: record in the index;
+ NOTE: the data fields in ref will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+{
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dict_col_t* col;
+ byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ ulint i;
+
+ ut_ad(ref && index && rec);
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ col = dict_field_get_col(
+ dict_index_get_nth_field(clust_index, i));
+ pos = dict_index_get_nth_col_pos(index, dict_col_get_no(col));
+
+ if (pos != ULINT_UNDEFINED) {
+ field = rec_get_nth_field(rec, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+ } else {
+ ut_ad(table->type == DICT_TABLE_CLUSTER_MEMBER);
+ ut_ad(i == table->mix_len);
+ ut_a(0);
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+}
+
+/***********************************************************************
+Builds from a row a row reference with which we can search the clustered
+index record. */
+
+void
+row_build_row_ref_from_row(
+/*=======================*/
+ dtuple_t* ref, /* in/out: row reference built; see the
+ NOTE below! ref must have the right number
+ of fields! */
+ dict_table_t* table, /* in: table */
+ dtuple_t* row) /* in: row
+ NOTE: the data fields in ref will point
+ directly into data of this row */
+{
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dfield_t* dfield2;
+ dict_col_t* col;
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(ref && table && row);
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ col = dict_field_get_col(
+ dict_index_get_nth_field(clust_index, i));
+
+ dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_copy(dfield, dfield2);
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+}
+
+/*******************************************************************
+Searches for the clustered index record of a row, given the row reference. */
+
+ibool
+row_search_on_row_ref(
+/*==================*/
+ /* out: TRUE if found */
+ btr_pcur_t* pcur, /* in/out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ dict_table_t* table, /* in: table */
+ dtuple_t* ref, /* in: row reference */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint low_match;
+ rec_t* rec;
+ dict_index_t* index;
+ page_t* page;
+
+ ut_ad(dtuple_check_typed(ref));
+
+ index = dict_table_get_first_index(table);
+
+ btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr);
+
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+ page = buf_frame_align(rec);
+
+ if (rec == page_get_infimum_rec(page)) {
+
+ return(FALSE);
+ }
+
+ if (low_match != dtuple_get_n_fields(ref)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
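+
+/* Note: the cursor above is opened in the PAGE_CUR_LE mode, so the row
+exists exactly when every field of ref matches the record under the
+cursor, i.e., when low_match == dtuple_get_n_fields(ref); a smaller
+low_match means the cursor stopped on some preceding record and the row
+is not in the index. */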
+
+/*************************************************************************
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved. */
+
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ /* out: record or NULL, if no record found */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ rec_t* rec, /* in: record in a secondary index */
+ dict_index_t* index, /* in: secondary index */
+ dict_index_t** clust_index,/* out: clustered index */
+ mtr_t* mtr) /* in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* ref;
+ dict_table_t* table;
+ btr_pcur_t pcur;
+ ibool found;
+ rec_t* clust_rec;
+
+ ut_ad((index->type & DICT_CLUSTERED) == 0);
+
+ table = index->table;
+
+ heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap);
+
+ found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
+
+ clust_rec = btr_pcur_get_rec(&pcur);
+
+ mem_heap_free(heap);
+
+ btr_pcur_close(&pcur);
+
+ *clust_index = dict_table_get_first_index(table);
+
+ if (!found) {
+
+ return(NULL);
+ }
+
+ return(clust_rec);
+}
+
+/*******************************************************************
+Searches an index record. */
+
+ibool
+row_search_index_entry(
+/*===================*/
+ /* out: TRUE if found */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry */
+ ulint mode, /* in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /* in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint n_fields;
+ ulint low_match;
+ page_t* page;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(entry));
+
+ btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+ page = buf_frame_align(rec);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ if (rec == page_get_infimum_rec(page)) {
+
+ return(FALSE);
+ }
+
+ if (low_match != n_fields) {
+ /* Not found */
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c
new file mode 100644
index 00000000000..bd7af5743d8
--- /dev/null
+++ b/innobase/row/row0sel.c
@@ -0,0 +1,2732 @@
+/*******************************************************
+Select
+
+(c) 1997 Innobase Oy
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0sel.h"
+
+#ifdef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0trx.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0vers.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+
+/* Maximum number of rows to prefetch; MySQL interface has another parameter */
+#define SEL_MAX_N_PREFETCH 16
+
+/* Number of rows fetched, after which to start prefetching; MySQL interface
+has another parameter */
+#define SEL_PREFETCH_LIMIT 1
+
+/* When a select has accessed about this many pages, it returns control back
+to que_run_threads: this is to allow canceling runaway queries */
+
+#define SEL_COST_LIMIT 100
+
+/* Flags for search shortcut */
+#define SEL_FOUND 0
+#define SEL_EXHAUSTED 1
+#define SEL_RETRY 2
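+
+/* These flags are returned by row_sel_try_search_shortcut: SEL_FOUND
+means that a qualifying row was fetched, SEL_EXHAUSTED that the unique
+search can match no row, and SEL_RETRY that the shortcut could not be
+used and the normal search path must be taken. */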
+
+/*************************************************************************
+Creates a select node struct. */
+
+sel_node_t*
+sel_node_create(
+/*============*/
+ /* out, own: select node struct */
+ mem_heap_t* heap) /* in: memory heap where created */
+{
+ sel_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(sel_node_t));
+ node->common.type = QUE_NODE_SELECT;
+ node->state = SEL_NODE_OPEN;
+
+ node->select_will_do_update = FALSE;
+ node->latch_mode = BTR_SEARCH_LEAF;
+
+ node->plans = NULL;
+
+ return(node);
+}
+
+/*************************************************************************
+Frees the memory private to a select node when a query graph is freed;
+it does not free the heap where the node itself was created. */
+
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node) /* in: select node struct */
+{
+ ulint i;
+ plan_t* plan;
+
+ if (node->plans != NULL) {
+ for (i = 0; i < node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(node, i);
+
+ btr_pcur_close(&(plan->pcur));
+ btr_pcur_close(&(plan->clust_pcur));
+
+ if (plan->old_vers_heap) {
+ mem_heap_free(plan->old_vers_heap);
+ }
+ }
+ }
+}
+
+/*************************************************************************
+Evaluates the values in a select list. If there are aggregate functions,
+their argument value is added to the aggregate total. */
+UNIV_INLINE
+void
+sel_eval_select_list(
+/*=================*/
+ sel_node_t* node) /* in: select node */
+{
+ que_node_t* exp;
+
+ exp = node->select_list;
+
+ while (exp) {
+ eval_exp(exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*************************************************************************
+Assigns the values in the select list to the possible into-variables in
+SELECT ... INTO ... */
+UNIV_INLINE
+void
+sel_assign_into_var_values(
+/*=======================*/
+ sym_node_t* var, /* in: first variable in a list of variables */
+ sel_node_t* node) /* in: select node */
+{
+ que_node_t* exp;
+
+ if (var == NULL) {
+
+ return;
+ }
+
+ exp = node->select_list;
+
+ while (var) {
+ ut_ad(exp);
+
+ eval_node_copy_val(var->alias, exp);
+
+ exp = que_node_get_next(exp);
+ var = que_node_get_next(var);
+ }
+}
+
+/*************************************************************************
+Resets the aggregate value totals in the select list of an aggregate type
+query. */
+UNIV_INLINE
+void
+sel_reset_aggregate_vals(
+/*=====================*/
+ sel_node_t* node) /* in: select node */
+{
+ func_node_t* func_node;
+
+ ut_ad(node->is_aggregate);
+
+ func_node = node->select_list;
+
+ while (func_node) {
+ eval_node_set_int_val(func_node, 0);
+
+ func_node = que_node_get_next(func_node);
+ }
+
+ node->aggregate_already_fetched = FALSE;
+}
+
+/*************************************************************************
+Copies the input variable values when an explicit cursor is opened. */
+UNIV_INLINE
+void
+row_sel_copy_input_variable_vals(
+/*=============================*/
+ sel_node_t* node) /* in: select node */
+{
+ sym_node_t* var;
+
+ var = UT_LIST_GET_FIRST(node->copy_variables);
+
+ while (var) {
+ eval_node_copy_val(var, var->alias);
+
+ var->indirection = NULL;
+
+ var = UT_LIST_GET_NEXT(col_var_list, var);
+ }
+}
+
+/*************************************************************************
+Fetches the column values from a record. */
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+ dict_index_t* index, /* in: record index */
+ rec_t* rec, /* in: record in a clustered or non-clustered
+ index */
+ sym_node_t* column) /* in: first column in a column list, or
+ NULL */
+{
+ dfield_t* val;
+ ulint index_type;
+ ulint field_no;
+ byte* data;
+ ulint len;
+
+ if (index->type & DICT_CLUSTERED) {
+ index_type = SYM_CLUST_FIELD_NO;
+ } else {
+ index_type = SYM_SEC_FIELD_NO;
+ }
+
+ while (column) {
+ field_no = column->field_nos[index_type];
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ data = rec_get_nth_field(rec, field_no, &len);
+
+ if (column->copy_val) {
+ eval_node_copy_and_alloc_val(column, data,
+ len);
+ } else {
+ val = que_node_get_val(column);
+ dfield_set_data(val, data, len);
+ }
+ }
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*************************************************************************
+Allocates a prefetch buffer for a column when prefetch is done for the
+first time. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+ sym_node_t* column) /* in: symbol table node for a column */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+ column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
+ * sizeof(sel_buf_t));
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = column->prefetch_buf + i;
+
+ sel_buf->data = NULL;
+
+ sel_buf->val_buf_size = 0;
+ }
+}
+
+/*************************************************************************
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf) /* in, own: prefetch buffer */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = prefetch_buf + i;
+
+ if (sel_buf->val_buf_size > 0) {
+
+ mem_free(sel_buf->data);
+ }
+ }
+}
+
+/*************************************************************************
+Pops the column values for a prefetched, cached row from the column prefetch
+buffers and places them in the val fields of the column nodes. */
+static
+void
+sel_pop_prefetched_row(
+/*===================*/
+ plan_t* plan) /* in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint val_buf_size;
+
+ ut_ad(plan->n_rows_prefetched > 0);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ val = que_node_get_val(column);
+
+ if (!column->copy_val) {
+ /* We did not really push any value for the
+ column */
+
+ ut_ad(!column->prefetch_buf);
+ ut_ad(que_node_get_val_buf_size(column) == 0);
+#ifdef UNIV_DEBUG
+ dfield_set_data(val, NULL, 0);
+#endif
+ goto next_col;
+ }
+
+ ut_ad(column->prefetch_buf);
+
+ sel_buf = column->prefetch_buf + plan->first_prefetched;
+
+ data = sel_buf->data;
+ len = sel_buf->len;
+ val_buf_size = sel_buf->val_buf_size;
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ sel_buf->data = dfield_get_data(val);
+ sel_buf->len = dfield_get_len(val);
+ sel_buf->val_buf_size = que_node_get_val_buf_size(column);
+
+ dfield_set_data(val, data, len);
+ que_node_set_val_buf_size(column, val_buf_size);
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+
+ plan->n_rows_prefetched--;
+
+ plan->first_prefetched++;
+}
+
+/*************************************************************************
+Pushes the column values for a prefetched, cached row to the column prefetch
+buffers from the val fields in the column nodes. */
+UNIV_INLINE
+void
+sel_push_prefetched_row(
+/*====================*/
+ plan_t* plan) /* in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint pos;
+ ulint val_buf_size;
+
+ if (plan->n_rows_prefetched == 0) {
+ pos = 0;
+ plan->first_prefetched = 0;
+ } else {
+ pos = plan->n_rows_prefetched;
+
+ /* We have the convention that pushing new rows starts only
+ after the prefetch stack has been emptied: */
+
+ ut_ad(plan->first_prefetched == 0);
+ }
+
+ plan->n_rows_prefetched++;
+
+ ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ if (!column->copy_val) {
+			/* It makes no sense to push pointers to database
+			page fields when we do not keep a latch on the page! */
+
+ goto next_col;
+ }
+
+ if (!column->prefetch_buf) {
+ /* Allocate a new prefetch buffer */
+
+ sel_col_prefetch_buf_alloc(column);
+ }
+
+ sel_buf = column->prefetch_buf + pos;
+
+ val = que_node_get_val(column);
+
+ data = dfield_get_data(val);
+ len = dfield_get_len(val);
+ val_buf_size = que_node_get_val_buf_size(column);
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ dfield_set_data(val, sel_buf->data, sel_buf->len);
+ que_node_set_val_buf_size(column, sel_buf->val_buf_size);
+
+ sel_buf->data = data;
+ sel_buf->len = len;
+ sel_buf->val_buf_size = val_buf_size;
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
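+
+/* Note on the pointer swap used above and in sel_pop_prefetched_row:
+the column value and the prefetch slot exchange their data pointers and
+buffer sizes instead of copying data, so each dynamically allocated
+buffer stays reachable from exactly one of the two places and can later
+be freed by sel_col_prefetch_buf_free; no memory is leaked. */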
+
+/*************************************************************************
+Builds a previous version of a clustered index record for a consistent read. */
+static
+ulint
+row_sel_build_prev_vers(
+/*====================*/
+ /* out: DB_SUCCESS or error code */
+ read_view_t* read_view, /* in: read view */
+ plan_t* plan, /* in: plan node for table */
+ rec_t* rec, /* in: record in a clustered index */
+ rec_t** old_vers, /* out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint err;
+
+ if (plan->old_vers_heap) {
+ mem_heap_empty(plan->old_vers_heap);
+ } else {
+ plan->old_vers_heap = mem_heap_create(512);
+ }
+
+ err = row_vers_build_for_consistent_read(rec, mtr, plan->index,
+ read_view, plan->old_vers_heap,
+ old_vers);
+ return(err);
+}
+
+/*************************************************************************
+Tests the conditions which determine when the index segment we are searching
+through has been exhausted. */
+UNIV_INLINE
+ibool
+row_sel_test_end_conds(
+/*===================*/
+ /* out: TRUE if row passed the tests */
+ plan_t* plan) /* in: plan for the table; the column values must
+ already have been retrieved and the right sides of
+ comparisons evaluated */
+{
+ func_node_t* cond;
+
+ /* All conditions in end_conds are comparisons of a column to an
+ expression */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ /* Evaluate the left side of the comparison, i.e., get the
+ column value if there is an indirection */
+
+ eval_sym(cond->args);
+
+ /* Do the comparison */
+
+ if (!eval_cmp(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Tests the other conditions. */
+UNIV_INLINE
+ibool
+row_sel_test_other_conds(
+/*=====================*/
+ /* out: TRUE if row passed the tests */
+ plan_t* plan) /* in: plan for the table; the column values must
+ already have been retrieved */
+{
+ func_node_t* cond;
+
+ cond = UT_LIST_GET_FIRST(plan->other_conds);
+
+ while (cond) {
+ eval_exp(cond);
+
+ if (!eval_node_get_ibool_val(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. */
+static
+ulint
+row_sel_get_clust_rec(
+/*==================*/
+ /* out: DB_SUCCESS or error code */
+ sel_node_t* node, /* in: select_node */
+ plan_t* plan, /* in: plan node for table */
+ rec_t* rec, /* in: record in a non-clustered index */
+ que_thr_t* thr, /* in: query thread */
+ rec_t** out_rec,/* out: clustered record or an old version of
+ it, NULL if the old version did not exist
+			in the read view, i.e., it was a freshly
+ inserted version */
+ mtr_t* mtr) /* in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+
+ row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec);
+
+ index = dict_table_get_first_index(plan->table);
+
+ btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
+ node->latch_mode, &(plan->clust_pcur),
+ 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
+
+ ut_ad(page_rec_is_user_rec(clust_rec));
+
+ if (!node->read_view) {
+ /* Try to place a lock on the index record */
+
+ err = lock_clust_rec_read_check_and_lock(0, clust_rec, index,
+ node->row_lock_mode, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (!lock_clust_rec_cons_read_sees(clust_rec, index,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(node->read_view, plan,
+ clust_rec, &old_vers, mtr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ clust_rec = old_vers;
+
+ if (clust_rec == NULL) {
+ *out_rec = clust_rec;
+
+ return(DB_SUCCESS);
+ }
+ }
+ }
+
+ /* Fetch the columns needed in test conditions */
+
+ row_sel_fetch_columns(index, clust_rec,
+ UT_LIST_GET_FIRST(plan->columns));
+ *out_rec = clust_rec;
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************************
+Sets a lock on a record. */
+UNIV_INLINE
+ulint
+sel_set_rec_lock(
+/*=============*/
+ /* out: DB_SUCCESS or error code */
+ rec_t* rec, /* in: record */
+ dict_index_t* index, /* in: index */
+ ulint mode, /* in: lock mode */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ if (index->type & DICT_CLUSTERED) {
+ err = lock_clust_rec_read_check_and_lock(0, rec, index, mode,
+ thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(0, rec, index, mode,
+ thr);
+ }
+
+ return(err);
+}
+
+/*************************************************************************
+Opens a pcur to a table index. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+ sel_node_t* node, /* in: select node */
+ plan_t* plan, /* in: table plan */
+ ibool search_latch_locked,
+ /* in: TRUE if the thread currently
+ has the search latch locked in
+ s-mode */
+ mtr_t* mtr) /* in: mtr */
+{
+ dict_index_t* index;
+ func_node_t* cond;
+ que_node_t* exp;
+ ulint n_fields;
+ ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
+ ulint i;
+
+ if (search_latch_locked) {
+ has_search_latch = RW_S_LATCH;
+ }
+
+ index = plan->index;
+
+ /* Calculate the value of the search tuple: the exact match columns
+ get their expressions evaluated when we evaluate the right sides of
+ end_conds */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ eval_exp(que_node_get_next(cond->args));
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+
+ if (plan->n_exact_match < n_fields) {
+ /* There is a non-exact match field which must be
+ evaluated separately */
+
+ eval_exp(plan->tuple_exps[n_fields - 1]);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ exp = plan->tuple_exps[i];
+
+ dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
+ que_node_get_val(exp));
+ }
+
+ /* Open pcur to the index */
+
+ btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
+ node->latch_mode, &(plan->pcur),
+ has_search_latch, mtr);
+ } else {
+ /* Open the cursor to the start or the end of the index
+ (FALSE: no init) */
+
+ btr_pcur_open_at_index_side(plan->asc, index, node->latch_mode,
+ &(plan->pcur), FALSE, mtr);
+ }
+
+ ut_ad(plan->n_rows_prefetched == 0);
+ ut_ad(plan->n_rows_fetched == 0);
+ ut_ad(plan->cursor_at_end == FALSE);
+
+ plan->pcur_is_open = TRUE;
+}
+
+/*************************************************************************
+Restores a stored pcur position to a table index. */
+UNIV_INLINE
+ibool
+row_sel_restore_pcur_pos(
+/*=====================*/
+ /* out: TRUE if the cursor should be moved to
+ the next record after we return from this
+ function (moved to the previous, in the case
+ of a descending cursor) without processing
+ again the current cursor record */
+ sel_node_t* node, /* in: select node */
+ plan_t* plan, /* in: table plan */
+ mtr_t* mtr) /* in: mtr */
+{
+ ibool equal_position;
+ ulint relative_position;
+
+ ut_ad(!plan->cursor_at_end);
+
+ relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
+
+ equal_position = btr_pcur_restore_position(node->latch_mode,
+ &(plan->pcur), mtr);
+
+ /* If the cursor is traveling upwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
+ yet on the successor of the page infimum;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ not yet processed the cursor record: no need to move the cursor to the
+ next record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we must move to the next record;
+ (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the next
+ record, else there is no need to move the cursor. */
+
+ if (plan->asc) {
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER);
+
+ return(FALSE);
+ }
+
+ /* If the cursor is traveling downwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
+ the last record LESS than the successor of a page infimum; we have not
+ processed the cursor record: no need to move the cursor;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ processed the cursor record: we should move the cursor to the previous
+ record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we need not move to the previous
+ record; (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the previous
+ record, else there is no need to move the cursor. */
+
+ if (relative_position == BTR_PCUR_BEFORE) {
+
+ return(FALSE);
+ }
+
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(FALSE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER);
+
+ return(TRUE);
+}
+
+/*************************************************************************
+Resets a plan cursor to a closed state. */
+UNIV_INLINE
+void
+plan_reset_cursor(
+/*==============*/
+ plan_t* plan) /* in: plan */
+{
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+ plan->n_rows_fetched = 0;
+ plan->n_rows_prefetched = 0;
+}
+
+/*************************************************************************
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). */
+static
+ulint
+row_sel_try_search_shortcut(
+/*========================*/
+ /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+ sel_node_t* node, /* in: select node for a consistent read */
+ plan_t* plan, /* in: plan for a unique search in clustered
+ index */
+ mtr_t* mtr) /* in: mtr */
+{
+ dict_index_t* index;
+ rec_t* rec;
+
+ index = plan->index;
+
+ ut_ad(node->read_view);
+ ut_ad(plan->unique_search);
+ ut_ad(!plan->must_get_clust);
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+
+ row_sel_open_pcur(node, plan, TRUE, mtr);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return(SEL_RETRY);
+ }
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (index->type & DICT_CLUSTERED) {
+ if (!lock_clust_rec_cons_read_sees(rec, index,
+ node->read_view)) {
+ return(SEL_RETRY);
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) {
+
+ return(SEL_RETRY);
+ }
+
+ /* Test deleted flag. Fetch the columns needed in test conditions. */
+
+ row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns));
+
+ if (rec_get_deleted_flag(rec)) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ ut_ad(plan->pcur.latch_mode == node->latch_mode);
+
+ plan->n_rows_fetched++;
+
+ return(SEL_FOUND);
+}
+
+/*************************************************************************
+Performs a select step. */
+static
+ulint
+row_sel(
+/*====*/
+ /* out: DB_SUCCESS or error code */
+ sel_node_t* node, /* in: select node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* index;
+ plan_t* plan;
+ mtr_t mtr;
+ ibool moved;
+ rec_t* rec;
+ rec_t* old_vers;
+ rec_t* clust_rec;
+ ibool search_latch_locked;
+ ibool consistent_read;
+
+ /* The following flag becomes TRUE when we are doing a
+ consistent read from a non-clustered index and we must look
+ at the clustered index to find out the previous delete mark
+ state of the non-clustered record: */
+
+ ibool cons_read_requires_clust_rec = FALSE;
+ ulint cost_counter = 0;
+ ibool cursor_just_opened;
+ ibool must_go_to_next;
+ ibool leaf_contains_updates = FALSE;
+ /* TRUE if select_will_do_update is
+ TRUE and the current clustered index
+ leaf page has been updated during
+ the current mtr: mtr must be committed
+ at the same time as the leaf x-latch
+ is released */
+ ibool mtr_has_extra_clust_latch = FALSE;
+ /* TRUE if the search was made using
+ a non-clustered index, and we had to
+ access the clustered record: now &mtr
+ contains a clustered index latch, and
+ &mtr must be committed before we move
+ to the next non-clustered record */
+ ulint found_flag;
+ ulint err;
+
+ ut_ad(thr->run_node == node);
+
+ search_latch_locked = FALSE;
+
+ if (node->read_view) {
+		/* In consistent reads, we try to make do with the hash index
+		and to avoid the buffer page get: this is to reduce the memory
+		bus load resulting from semaphore operations. The search latch
+ will be s-locked when we access an index with a unique search
+ condition, but not locked when we access an index with a
+ less selective search condition. */
+
+ consistent_read = TRUE;
+ } else {
+ consistent_read = FALSE;
+ }
+
+table_loop:
+ /* TABLE LOOP
+ ----------
+ This is the outer major loop in calculating a join. We come here when
+ node->fetch_table changes, and after adding a row to aggregate totals
+ and, of course, when this function is called. */
+
+ ut_ad(leaf_contains_updates == FALSE);
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ plan = sel_node_get_nth_plan(node, node->fetch_table);
+ index = plan->index;
+
+ if (plan->n_rows_prefetched > 0) {
+ sel_pop_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+ if (plan->cursor_at_end) {
+		/* The cursor has already reached the result set end: no more
+		rows to process for this table cursor, and the prefetch
+		stack was empty as well */
+
+ ut_ad(plan->pcur_is_open);
+
+ goto table_exhausted_no_mtr;
+ }
+
+ /* Open a cursor to index, or restore an open cursor position */
+
+ mtr_start(&mtr);
+
+ if (consistent_read && plan->unique_search && !plan->pcur_is_open
+ && !plan->must_get_clust) {
+ if (!search_latch_locked) {
+ rw_lock_s_lock(&btr_search_latch);
+
+ search_latch_locked = TRUE;
+ } else if (btr_search_latch.writer_is_wait_ex) {
+
+ /* There is an x-latch request waiting: release the
+ s-latch for a moment; as an s-latch here is often
+ kept for some 10 searches before being released,
+ a waiting x-latch request would block other threads
+ from acquiring an s-latch for a long time, lowering
+ performance significantly in multiprocessors. */
+
+ rw_lock_s_unlock(&btr_search_latch);
+ rw_lock_s_lock(&btr_search_latch);
+ }
+
+ found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
+
+ if (found_flag == SEL_FOUND) {
+
+ goto next_table;
+
+ } else if (found_flag == SEL_EXHAUSTED) {
+
+ goto table_exhausted;
+ }
+
+ ut_ad(found_flag == SEL_RETRY);
+
+ plan_reset_cursor(plan);
+
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ }
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+
+ search_latch_locked = FALSE;
+ }
+
+ if (!plan->pcur_is_open) {
+ /* Evaluate the expressions to build the search tuple and
+ open the cursor */
+
+ row_sel_open_pcur(node, plan, search_latch_locked, &mtr);
+
+ cursor_just_opened = TRUE;
+
+ /* A new search was made: increment the cost counter */
+ cost_counter++;
+ } else {
+ /* Restore pcur position to the index */
+
+ must_go_to_next = row_sel_restore_pcur_pos(node, plan, &mtr);
+
+ cursor_just_opened = FALSE;
+
+ if (must_go_to_next) {
+ /* We have already processed the cursor record: move
+ to the next */
+
+ goto next_rec;
+ }
+ }
+
+rec_loop:
+ /* RECORD LOOP
+ -----------
+ In this loop we use pcur and try to fetch a qualifying row, and
+ also fill the prefetch buffer for this table if n_rows_fetched has
+ exceeded a threshold. While we are inside this loop, the following
+ holds:
+ (1) &mtr is started,
+ (2) pcur is positioned and open.
+
+ NOTE that if cursor_just_opened is TRUE here, it means that we came
+ to this point right after row_sel_open_pcur. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ /* PHASE 1: Set a lock if specified */
+
+ if (!node->asc && cursor_just_opened
+ && (rec != page_get_supremum_rec(buf_frame_align(rec)))) {
+
+ /* When we open a cursor for a descending search, we must set
+ a next-key lock on the successor record: otherwise it would
+ be possible to insert new records next to the cursor position,
+ and it might be that these new records should appear in the
+ search result set, resulting in the phantom problem. */
+
+ if (!consistent_read) {
+ err = sel_set_rec_lock(page_rec_get_next(rec), index,
+ node->row_lock_mode, thr);
+ if (err != DB_SUCCESS) {
+ /* Note that in this case we will store in pcur
+ the PREDECESSOR of the record we are waiting
+ the lock for */
+
+ goto lock_wait_or_error;
+ }
+ }
+ }
+
+ if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. We also increment the cost counter as we may have
+		processed yet another page of the index. */
+
+ cost_counter++;
+
+ goto next_rec;
+ }
+
+ if (!consistent_read) {
+ /* Try to place a lock on the index record */
+
+ err = sel_set_rec_lock(rec, index, node->row_lock_mode, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ /* A page supremum record cannot be in the result set: skip
+ it now when we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (cost_counter > SEL_COST_LIMIT) {
+
+ /* Now that we have placed the necessary locks, we can stop
+ for a while and store the cursor position; NOTE that if we
+ would store the cursor position BEFORE placing a record lock,
+ it might happen that the cursor would jump over some records
+ that another transaction could meanwhile insert adjacent to
+ the cursor: this would result in the phantom problem. */
+
+ goto stop_for_a_while;
+ }
+
+	/* PHASE 2: Check the mix id of a mixed index record, if needed */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search
+ with the mode PAGE_CUR_GE, the up_match field in the cursor
+		tells how many fields in the user record matched the search
+		tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur))
+ < plan->n_exact_match) {
+ goto table_exhausted;
+ }
+
+ /* Ok, no need to test end_conds or mix id */
+
+ } else if (plan->mixed_index) {
+ /* We have to check if the record in a mixed cluster belongs
+ to this table */
+
+ if (!dict_is_mixed_table_rec(plan->table, rec)) {
+
+ goto next_rec;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ /* PHASE 3: Get previous version in a consistent read */
+
+ if (consistent_read) {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (index->type & DICT_CLUSTERED) {
+
+ if (!lock_clust_rec_cons_read_sees(rec, index,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(node->read_view,
+ plan, rec, &old_vers,
+ &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ row_sel_fetch_columns(index, rec,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, index,
+ node->read_view)) {
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+
+ /* PHASE 4: Test search end conditions and deleted flag */
+
+ /* Fetch the columns needed in test conditions */
+
+ row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the selection end conditions: these can only contain columns
+ which already are found in the index, even though the index might be
+ non-clustered */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ /* No test necessary: the test was already made above */
+
+ } else if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) {
+
+ /* The record is delete marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 5: Get the clustered index record, if needed and if we did
+ not do the search using the clustered index */
+
+ if (plan->must_get_clust || cons_read_requires_clust_rec) {
+
+		/* It was a non-clustered index and we must also fetch the
+		clustered index record */
+
+ err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
+ &mtr);
+ mtr_has_extra_clust_latch = TRUE;
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* Retrieving the clustered record required a search:
+ increment the cost counter */
+
+ cost_counter++;
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(consistent_read);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec)) {
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ if (node->can_get_updated) {
+
+ btr_pcur_store_position(&(plan->clust_pcur), &mtr);
+ }
+ }
+
+ /* PHASE 6: Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 7: We found a new qualifying row for the current table; push
+ the row if prefetch is on, or move to the next table in the join */
+
+ plan->n_rows_fetched++;
+
+ ut_ad(plan->pcur.latch_mode == node->latch_mode);
+
+ if (node->select_will_do_update) {
+ /* This is a searched update and we can do the update in-place,
+ saving CPU time */
+
+ row_upd_in_place_in_select(node, thr, &mtr);
+
+ leaf_contains_updates = TRUE;
+
+ /* When the database is in the online backup mode, the number
+ of log records for a single mtr should be small: increment the
+ cost counter to ensure it */
+
+ cost_counter += 1 + (SEL_COST_LIMIT / 8);
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
+ || plan->unique_search || plan->no_prefetch) {
+
+ /* No prefetch in operation: go to the next table */
+
+ goto next_table;
+ }
+
+ sel_push_prefetched_row(plan);
+
+ if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
+
+ /* The prefetch buffer is now full */
+
+ sel_pop_prefetched_row(plan);
+
+ goto next_table;
+ }
+
+next_rec:
+ ut_ad(!search_latch_locked);
+
+ if (mtr_has_extra_clust_latch) {
+
+ /* We must commit &mtr if we are moving to the next
+ non-clustered index record, because we could break the
+		latching order if we accessed a different clustered
+		index page right away without releasing the previous one. */
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (leaf_contains_updates
+ && btr_pcur_is_after_last_on_page(&(plan->pcur), &mtr)) {
+
+ /* We must commit &mtr if we are moving to a different page,
+ because we have done updates to the x-latched leaf page, and
+ the latch would be released in btr_pcur_move_to_next, without
+ &mtr getting committed there */
+
+ ut_ad(node->asc);
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (node->asc) {
+ moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
+ }
+
+ if (!moved) {
+
+ goto table_exhausted;
+ }
+
+ cursor_just_opened = FALSE;
+
+ /* END OF RECORD LOOP
+ ------------------ */
+ goto rec_loop;
+
+next_table:
+ /* We found a record which satisfies the conditions: we can move to
+ the next table or return a row in the result set */
+
+ ut_ad(btr_pcur_is_on_user_rec(&(plan->pcur), &mtr));
+
+ if (plan->unique_search && !node->can_get_updated) {
+
+ plan->cursor_at_end = TRUE;
+ } else {
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ leaf_contains_updates = FALSE;
+ mtr_has_extra_clust_latch = FALSE;
+
+next_table_no_mtr:
+ /* If we use 'goto' to this label, it means that the row was popped
+ from the prefetched rows stack, and &mtr is already committed */
+
+ if (node->fetch_table + 1 == node->n_tables) {
+
+ sel_eval_select_list(node);
+
+ if (node->is_aggregate) {
+
+ goto table_loop;
+ }
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ node->fetch_table++;
+
+ /* When we move to the next table, we first reset the plan cursor:
+ we do not care about resetting it when we backtrack from a table */
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
+
+ goto table_loop;
+
+table_exhausted:
+ /* The table cursor pcur reached the result set end: backtrack to the
+ previous table in the join if we do not have cached prefetched rows */
+
+ plan->cursor_at_end = TRUE;
+
+ mtr_commit(&mtr);
+
+ leaf_contains_updates = FALSE;
+ mtr_has_extra_clust_latch = FALSE;
+
+ if (plan->n_rows_prefetched > 0) {
+ /* The table became exhausted during a prefetch */
+
+ sel_pop_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+table_exhausted_no_mtr:
+ if (node->fetch_table == 0) {
+
+ if (node->is_aggregate && !node->aggregate_already_fetched) {
+
+ node->aggregate_already_fetched = TRUE;
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ node->state = SEL_NODE_NO_MORE_ROWS;
+
+ thr->run_node = que_node_get_parent(node);
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ node->fetch_table--;
+
+ goto table_loop;
+
+stop_for_a_while:
+ /* Return control for a while to que_run_threads, so that runaway
+ queries can be canceled. NOTE that when we come here, we must, in a
+ locking read, have placed the necessary (possibly waiting request)
+ record lock on the cursor record or its successor: when we reposition
+ the cursor, this record lock guarantees that nobody can meanwhile have
+ inserted new records which should have appeared in the result set,
+ which would result in the phantom problem. */
+
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+
+ return(DB_SUCCESS);
+
+commit_mtr_for_a_while:
+ /* Stores the cursor position and commits &mtr; this is used if
+ &mtr may contain latches which would break the latching order if
+ &mtr would not be committed and the latches released. */
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ ut_ad(!search_latch_locked);
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ leaf_contains_updates = FALSE;
+ mtr_has_extra_clust_latch = FALSE;
+
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+
+ goto table_loop;
+
+lock_wait_or_error:
+ /* See the note at stop_for_a_while: the same holds for this case */
+
+ ut_ad(!btr_pcur_is_before_first_on_page(&(plan->pcur), &mtr)
+ || !node->asc);
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+
+ return(err);
+}
+
+/**************************************************************************
+Performs a select step. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+row_sel_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint i_lock_mode;
+ sym_node_t* table_node;
+ sel_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+	/* If this is the first time this node is executed (or when execution
+	resumes after a wait for a table intention lock), set intention locks
+ on the tables, or assign a read view */
+
+ if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+
+ node->state = SEL_NODE_OPEN;
+ }
+
+ if (node->state == SEL_NODE_OPEN) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(thr_get_trx(thr));
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+ if (node->consistent_read) {
+ /* Assign a read view for the query */
+ node->read_view = trx_assign_read_view(
+ thr_get_trx(thr));
+ } else {
+ if (node->set_x_locks) {
+ i_lock_mode = LOCK_IX;
+ } else {
+ i_lock_mode = LOCK_IS;
+ }
+
+ table_node = node->table_list;
+
+ while (table_node) {
+ err = lock_table(0, table_node->table,
+ i_lock_mode, thr);
+ if (err != DB_SUCCESS) {
+
+ que_thr_handle_error(thr, DB_ERROR,
+ NULL, 0);
+ return(NULL);
+ }
+
+ table_node = que_node_get_next(table_node);
+ }
+ }
+
+ /* If this is an explicit cursor, copy stored procedure
+ variable values, so that the values cannot change between
+ fetches (currently, we copy them also for non-explicit
+ cursors) */
+
+ if (node->explicit_cursor &&
+ UT_LIST_GET_FIRST(node->copy_variables)) {
+
+ row_sel_copy_input_variable_vals(node);
+ }
+
+ node->state = SEL_NODE_FETCH;
+ node->fetch_table = 0;
+
+ if (node->is_aggregate) {
+ /* Reset the aggregate total values */
+ sel_reset_aggregate_vals(node);
+ }
+ }
+
+ err = row_sel(node, thr);
+
+ /* NOTE! if queries are parallelized, the following assignment may
+ have problems; the assignment should be made only if thr is the
+ only top-level thr in the graph: */
+
+ thr->graph->last_sel_node = node;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+ printf("SQL error %lu\n", err);
+
+ que_thr_handle_error(thr, DB_ERROR, NULL, 0);
+
+ return(NULL);
+ }
+
+ return(thr);
+}
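+
+/* NOTE (illustrative summary, not part of the original code): the
+sel_node states used above form a small state machine:
+
+	SEL_NODE_OPEN  --row_sel_step: locks or read view assigned-->
+	SEL_NODE_FETCH --row_sel: first table exhausted-->
+	SEL_NODE_NO_MORE_ROWS
+
+fetch_step below treats a SEL_NODE_CLOSED cursor as an SQL error, and
+row_printf_step resets the state to SEL_NODE_OPEN when the node is
+entered from its parent. */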
+
+/**************************************************************************
+Performs a fetch for a cursor. */
+
+que_thr_t*
+fetch_step(
+/*=======*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ sel_node_t* sel_node;
+ fetch_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ sel_node = node->cursor_def;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
+
+ if (thr->prev_node != que_node_get_parent(node)) {
+
+ if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
+
+ sel_assign_into_var_values(node->into_list, sel_node);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+	/* Make the fetch node the parent of the cursor definition for
+	the time of the fetch, so that execution knows to return to this
+	fetch node after a row has been selected, or when it is known
+	that no rows are left */
+
+ sel_node->common.parent = node;
+
+ if (sel_node->state == SEL_NODE_CLOSED) {
+ /* SQL error detected */
+ printf("SQL error %lu\n", DB_ERROR);
+
+ que_thr_handle_error(thr, DB_ERROR, NULL, 0);
+
+ return(NULL);
+ }
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/***************************************************************
+Prints a row in a select result. */
+
+que_thr_t*
+row_printf_step(
+/*============*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ row_printf_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* arg;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ sel_node = node->sel_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+
+ if (sel_node->state != SEL_NODE_FETCH) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to print */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ arg = sel_node->select_list;
+
+ while (arg) {
+ dfield_print_also_hex(que_node_get_val(arg));
+
+ printf(" ::: ");
+
+ arg = que_node_get_next(arg);
+ }
+
+ printf("\n");
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/********************************************************************
+Converts a key value stored in MySQL format to an Innobase dtuple.
+The last field of the key value may be just a prefix of a fixed length
+field: hence the parameter key_len. */
+
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /* in: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /* in: buffer to use in field
+ conversions */
+ dict_index_t* index, /* in: index of the key value */
+ byte* key_ptr, /* in: MySQL key value */
+ ulint key_len) /* in: MySQL key value length */
+{
+ dfield_t* dfield;
+ ulint offset;
+ ulint len;
+ byte* key_end;
+ ulint n_fields = 0;
+
+ key_end = key_ptr + key_len;
+
+ /* Permit us to access any field in the tuple (ULINT_MAX): */
+
+ dtuple_set_n_fields(tuple, ULINT_MAX);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ if (dfield_get_type(dfield)->mtype == DATA_SYS) {
+ /* A special case: we are looking for a position in a
+ generated clustered index: the first and the only
+ ordering column is ROW_ID */
+
+ ut_a(key_len == DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+ dtuple_set_n_fields(tuple, 1);
+
+ return;
+ }
+
+ while (key_ptr < key_end) {
+ offset = 0;
+ len = dfield_get_type(dfield)->len;
+
+ n_fields++;
+
+ if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+ /* The first byte in the field tells if this is
+ an SQL NULL value */
+
+ offset = 1;
+
+ if (*key_ptr != 0) {
+ dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
+
+ goto next_part;
+ }
+ }
+
+ row_mysql_store_col_in_innobase_format(
+ dfield, buf, key_ptr + offset, len,
+ dfield_get_type(dfield)->mtype,
+ dfield_get_type(dfield)->prtype
+ & DATA_UNSIGNED);
+ next_part:
+ key_ptr += (offset + len);
+
+ if (key_ptr > key_end) {
+ /* The last field in key was not a complete
+ field but a prefix of it */
+
+ ut_ad(dfield_get_len(dfield) != UNIV_SQL_NULL);
+
+ dfield_set_data(dfield, buf,
+ len - (ulint)(key_ptr - key_end));
+ }
+
+ buf += len;
+
+ dfield++;
+ }
+
+ /* We set the length of tuple to n_fields: we assume that
+ the memory area allocated for it is big enough (usually
+ bigger than n_fields). */
+
+ dtuple_set_n_fields(tuple, n_fields);
+}
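+
+/* EXAMPLE (illustrative sketch, not part of the original code): the
+loop above assumes the following MySQL key value layout for each key
+part: for a nullable column there is one leading flag byte, nonzero
+for SQL NULL, followed by the fixed column length of data bytes (the
+data bytes are present, but garbage, when the value is NULL); for a
+NOT NULL column the flag byte is absent. In outline, using the
+variables of the loop:
+
+	if the column is nullable:
+		offset = 1
+		if *key_ptr != 0, the key part is SQL NULL
+	else:
+		offset = 0
+	the value is the len bytes at key_ptr + offset, in MySQL format
+	key_ptr += offset + len
+*/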
+
+/******************************************************************
+Stores the row id to the prebuilt struct. */
+UNIV_INLINE
+void
+row_sel_store_row_id_to_prebuilt(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /* in: prebuilt */
+ rec_t* index_rec, /* in: record */
+ dict_index_t* index) /* in: index of the record */
+{
+ byte* data;
+ ulint len;
+
+ data = rec_get_nth_field(index_rec,
+ dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
+
+ ut_a(len == DATA_ROW_ID_LEN);
+
+ ut_memcpy(prebuilt->row_id, data, len);
+}
+
+/******************************************************************
+Stores a non-SQL-NULL field in the MySQL format. */
+UNIV_INLINE
+void
+row_sel_field_store_in_mysql_format(
+/*================================*/
+ byte* dest, /* in/out: buffer where to store; NOTE that BLOBs
+ are not in themselves stored here: the caller must
+ allocate and copy the BLOB into buffer before, and pass
+ the pointer to the BLOB in 'data' */
+ ulint col_len,/* in: MySQL column length */
+ byte* data, /* in: data to store */
+ ulint len, /* in: length of the data */
+ ulint type, /* in: data type */
+ ulint is_unsigned)/* in: != 0 if an unsigned integer type */
+{
+ byte* ptr;
+
+ ut_ad(len != UNIV_SQL_NULL);
+
+ if (type == DATA_INT) {
+ /* Convert integer data from Innobase to a little-endian
+ format, sign bit restored to normal */
+
+ ptr = dest + len;
+
+ for (;;) {
+ ptr--;
+ *ptr = *data;
+ if (ptr == dest) {
+ break;
+ }
+ data++;
+ }
+
+ if (!is_unsigned) {
+ dest[len - 1] = dest[len - 1] ^ 128;
+ }
+
+ ut_ad(col_len == len);
+ } else if (type == DATA_VARCHAR || type == DATA_VARMYSQL
+ || type == DATA_BINARY) {
+ /* Store the length of the data to the first two bytes of
+ dest; does not do anything yet because MySQL has
+ no real vars! */
+
+ dest = row_mysql_store_var_len(dest, len);
+ ut_memcpy(dest, data, len);
+
+ /* ut_ad(col_len >= len + 2); No real var implemented in
+ MySQL yet! */
+
+ } else if (type == DATA_BLOB) {
+ /* Store a pointer to the BLOB buffer to dest: the BLOB was
+ already copied to the buffer in row_sel_store_mysql_rec */
+
+ row_mysql_store_blob_ref(dest, col_len, data, len);
+ } else {
+ ut_memcpy(dest, data, len);
+ ut_ad(col_len == len);
+ }
+}
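+
+#if 0
+/* EXAMPLE: not part of the original code. A self-contained sketch of
+the DATA_INT branch above, for an assumed 4-byte signed column.
+Innobase stores integers big-endian with the sign bit inverted, so
+that an unsigned byte comparison orders the values numerically; MySQL
+expects little-endian bytes with a normal two's complement sign bit.
+The function name and the C99 fixed-width types are illustrative
+only. */
+#include <stdint.h>
+
+static int32_t
+innodb_int4_to_native(const unsigned char* data)
+{
+	uint32_t	v;
+
+	/* Assemble the big-endian stored bytes */
+	v = ((uint32_t) data[0] << 24)
+		| ((uint32_t) data[1] << 16)
+		| ((uint32_t) data[2] << 8)
+		| (uint32_t) data[3];
+
+	/* Restore the sign bit; for an unsigned column this xor must
+	be omitted, just as the is_unsigned test above skips it */
+	v ^= 0x80000000;
+
+	return((int32_t) v);
+}
+#endif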
+
+/******************************************************************
+Converts a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only some of
+the columns to mysql_rec: the other columns are left blank, as not all
+columns may be needed in the query. */
+static
+void
+row_sel_store_mysql_rec(
+/*====================*/
+ byte* mysql_rec, /* out: row in the MySQL format */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct */
+ rec_t* rec) /* in: Innobase record in the index
+ which was described in prebuilt's
+ template */
+{
+ mysql_row_templ_t* templ;
+ byte* data;
+ ulint len;
+ byte* blob_buf;
+ ulint i;
+
+ ut_ad(prebuilt->mysql_template);
+
+ if (prebuilt->blob_heap != NULL) {
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+ }
+
+ /* Mark all columns as not SQL NULL */
+
+ memset(mysql_rec, '\0', prebuilt->null_bitmap_len);
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+
+ data = rec_get_nth_field(rec, templ->rec_field_no, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (templ->type == DATA_BLOB) {
+
+ /* Copy the BLOB data to the BLOB
+ heap of prebuilt */
+
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap =
+ mem_heap_create(len);
+ }
+
+ blob_buf = mem_heap_alloc(prebuilt->blob_heap,
+ len);
+ ut_memcpy(blob_buf, data, len);
+
+ data = blob_buf;
+ }
+
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ->mysql_col_len, data, len,
+ templ->type, templ->is_unsigned);
+ } else {
+ mysql_rec[templ->mysql_null_byte_offset] |=
+ (byte) (templ->mysql_null_bit_mask);
+ }
+ }
+}
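+
+/* EXAMPLE (illustrative, not part of the original code): after the
+loop above, the SQL NULL flag of a column can be read back from the
+row buffer using the same template fields:
+
+	if (mysql_rec[templ->mysql_null_byte_offset]
+			& (byte) templ->mysql_null_bit_mask) {
+		... the column is SQL NULL in this row ...
+	}
+*/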
+
+/*************************************************************************
+Builds a previous version of a clustered index record for a consistent read */
+static
+ulint
+row_sel_build_prev_vers_for_mysql(
+/*==============================*/
+ /* out: DB_SUCCESS or error code */
+ read_view_t* read_view, /* in: read view */
+ dict_index_t* clust_index, /* in: clustered index */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct */
+ rec_t* rec, /* in: record in a clustered index */
+ rec_t** old_vers, /* out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /* in: mtr */
+{
+ ulint err;
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(200);
+ }
+
+ err = row_vers_build_for_consistent_read(rec, mtr, clust_index,
+ read_view, prebuilt->old_vers_heap,
+ old_vers);
+ return(err);
+}
+
+/*************************************************************************
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. Used in the MySQL
+interface. */
+static
+ulint
+row_sel_get_clust_rec_for_mysql(
+/*============================*/
+ /* out: DB_SUCCESS or error code */
+ row_prebuilt_t* prebuilt,/* in: prebuilt struct in the handle */
+ dict_index_t* sec_index,/* in: secondary index where rec resides */
+ rec_t* rec, /* in: record in a non-clustered index */
+ que_thr_t* thr, /* in: query thread */
+ rec_t** out_rec,/* out: clustered record or an old version of
+ it, NULL if the old version did not exist
+				in the read view, i.e., it was a freshly
+				inserted version */
+ mtr_t* mtr) /* in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* clust_index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+ trx_t* trx;
+
+ *out_rec = NULL;
+
+ row_build_row_ref_in_tuple(prebuilt->clust_ref, sec_index, rec);
+
+ clust_index = dict_table_get_first_index(sec_index->table);
+
+ btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ prebuilt->clust_pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
+
+ ut_ad(page_rec_is_user_rec(clust_rec));
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record */
+
+ err = lock_clust_rec_read_check_and_lock(0, clust_rec,
+ clust_index,
+ prebuilt->select_lock_type, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ trx = thr_get_trx(thr);
+
+ if (!lock_clust_rec_cons_read_sees(clust_rec, clust_index,
+ trx->read_view)) {
+
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index,
+ prebuilt, clust_rec,
+ &old_vers, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ clust_rec = old_vers;
+ }
+ }
+
+ *out_rec = clust_rec;
+
+ if (prebuilt->select_lock_type == LOCK_X) {
+ /* We may use the cursor in update: store its position */
+
+ btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/************************************************************************
+Restores cursor position after it has been stored. We have to take into
+account that the record on which the cursor was positioned may have been
+deleted. Then we may have to move the cursor one step up or down. */
+static
+ibool
+sel_restore_position_for_mysql(
+/*===========================*/
+ /* out: TRUE if we may need to
+ process the record the cursor is
+ now positioned on (i.e. we should
+ not go to the next record yet) */
+ ulint latch_mode, /* in: latch mode wished in
+ restoration */
+ btr_pcur_t* pcur, /* in: cursor whose position
+ has been stored */
+ ibool moves_up, /* in: TRUE if the cursor moves up
+ in the index */
+ mtr_t* mtr) /* in: mtr; CAUTION: may commit
+ mtr temporarily! */
+{
+ ibool success;
+ ulint relative_position;
+
+ relative_position = pcur->rel_pos;
+
+ success = btr_pcur_restore_position(latch_mode, pcur, mtr);
+
+ if (relative_position == BTR_PCUR_ON) {
+ if (success) {
+ return(FALSE);
+ }
+
+ if (moves_up) {
+ btr_pcur_move_to_next(pcur, mtr);
+
+ return(TRUE);
+ }
+
+ return(TRUE);
+ }
+
+ if (relative_position == BTR_PCUR_AFTER) {
+ if (moves_up) {
+ return(TRUE);
+ }
+
+ if (btr_pcur_is_on_user_rec(pcur, mtr)) {
+ btr_pcur_move_to_prev(pcur, mtr);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_BEFORE);
+
+ if (moves_up && btr_pcur_is_on_user_rec(pcur, mtr)) {
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return(TRUE);
+}
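+
+/* Summary of the cases above (illustrative, not part of the original
+code); TRUE means that the caller may still need to process the record
+the cursor is now positioned on:
+
+	stored rel_pos	restored?	moves_up	action		returns
+	BTR_PCUR_ON	yes		any		none		FALSE
+	BTR_PCUR_ON	no		TRUE		move to next	TRUE
+	BTR_PCUR_ON	no		FALSE		none		TRUE
+	BTR_PCUR_AFTER	any		TRUE		none		TRUE
+	BTR_PCUR_AFTER	any		FALSE		move to prev*	TRUE
+	BTR_PCUR_BEFORE	any		TRUE		move to next*	TRUE
+	BTR_PCUR_BEFORE	any		FALSE		none		TRUE
+
+The moves marked with * are only made if the cursor is positioned on
+a user record. */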
+
+/************************************************************************
+Pops a cached row for MySQL from the fetch cache. */
+UNIV_INLINE
+void
+row_sel_pop_cached_row_for_mysql(
+/*=============================*/
+ byte* buf, /* in/out: buffer where to copy the
+ row */
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct */
+{
+ ut_ad(prebuilt->n_fetch_cached > 0);
+
+ ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first],
+ prebuilt->mysql_row_len);
+ prebuilt->n_fetch_cached--;
+ prebuilt->fetch_cache_first++;
+
+ if (prebuilt->n_fetch_cached == 0) {
+ prebuilt->fetch_cache_first = 0;
+ }
+}
+
+/************************************************************************
+Pushes a row for MySQL to the fetch cache. */
+UNIV_INLINE
+void
+row_sel_push_cache_row_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct */
+ rec_t* rec) /* in: record to push */
+{
+ ulint i;
+
+ ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+ if (prebuilt->fetch_cache[0] == NULL) {
+ /* Allocate memory for the fetch cache */
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ prebuilt->fetch_cache[i] = mem_alloc(
+ prebuilt->mysql_row_len);
+ }
+ }
+
+ ut_ad(prebuilt->fetch_cache_first == 0);
+
+ row_sel_store_mysql_rec(
+ prebuilt->fetch_cache[prebuilt->n_fetch_cached],
+ prebuilt, rec);
+
+ prebuilt->n_fetch_cached++;
+}
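+
+#if 0
+/* EXAMPLE: not part of the original code. The fetch cache above is a
+plain FIFO of at most MYSQL_FETCH_CACHE_SIZE rows already converted to
+the MySQL row format. The same push/pop discipline in isolation, with
+hypothetical names and fixed illustrative sizes: */
+#include <string.h>
+#include <assert.h>
+
+#define EX_CACHE_SIZE	8	/* assumed capacity */
+#define EX_ROW_LEN	64	/* assumed row length in bytes */
+
+typedef struct {
+	unsigned char	rows[EX_CACHE_SIZE][EX_ROW_LEN];
+	int		n_cached;	/* number of rows in the cache */
+	int		first;		/* index of the next row to pop */
+} ex_fetch_cache_t;
+
+static void
+ex_cache_push(ex_fetch_cache_t* c, const unsigned char* row)
+{
+	/* Pushing is only done while nothing has been popped yet:
+	the cache is filled completely before it is drained */
+	assert(c->first == 0 && c->n_cached < EX_CACHE_SIZE);
+
+	memcpy(c->rows[c->n_cached], row, EX_ROW_LEN);
+	c->n_cached++;
+}
+
+static void
+ex_cache_pop(ex_fetch_cache_t* c, unsigned char* buf)
+{
+	assert(c->n_cached > 0);
+
+	memcpy(buf, c->rows[c->first], EX_ROW_LEN);
+	c->n_cached--;
+	c->first++;
+
+	if (c->n_cached == 0) {
+		c->first = 0;	/* reset once the cache is drained */
+	}
+}
+#endif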
+
+/************************************************************************
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position, and fetch next or fetch prev must not be attempted on the
+cursor! */
+
+ulint
+row_search_for_mysql(
+/*=================*/
+ /* out: DB_SUCCESS,
+ DB_RECORD_NOT_FOUND,
+ DB_END_OF_INDEX, or DB_DEADLOCK */
+ byte* buf, /* in/out: buffer for the fetched
+ row in the MySQL format */
+ ulint mode, /* in: search mode PAGE_CUR_L, ... */
+ row_prebuilt_t* prebuilt, /* in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint match_mode, /* in: 0 or ROW_SEL_EXACT or
+ ROW_SEL_EXACT_PREFIX */
+ ulint direction) /* in: 0 or ROW_SEL_NEXT or
+ ROW_SEL_PREV; NOTE: if this is != 0,
+ then prebuilt must have a pcur
+					with a stored position! When opening
+					a cursor, 'direction' should be 0. */
+{
+ dict_index_t* index = prebuilt->index;
+ dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ dict_index_t* clust_index;
+ que_thr_t* thr;
+ rec_t* rec;
+ rec_t* index_rec;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+ ibool moved;
+ ibool cons_read_requires_clust_rec;
+ ibool was_lock_wait;
+ ulint ret;
+ ibool unique_search_from_clust_index = FALSE;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ ibool moves_up = FALSE;
+ mtr_t mtr;
+
+ ut_ad(index && pcur && search_tuple);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ ut_ad(sync_thread_levels_empty_gen(FALSE));
+
+ if (direction == 0) {
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+ } else {
+ if (prebuilt->n_rows_fetched == 0) {
+ prebuilt->fetch_direction = direction;
+ }
+
+ if (direction != prebuilt->fetch_direction) {
+ if (prebuilt->n_fetch_cached > 0) {
+ ut_a(0);
+ /* TODO: scrollable cursor: restore cursor to
+ the place of the latest returned row,
+ or better: prevent caching for a scroll
+ cursor! */
+ }
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ } else if (prebuilt->n_fetch_cached > 0) {
+ row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+ prebuilt->n_rows_fetched++;
+
+ return(DB_SUCCESS);
+ }
+
+ if (prebuilt->fetch_cache_first > 0
+ && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
+
+			/* The previously returned row was popped from the fetch
+ cache, but the cache was not full at the time of the
+ popping: no more rows can exist in the result set */
+
+ return(DB_RECORD_NOT_FOUND);
+ }
+
+ prebuilt->n_rows_fetched++;
+
+ if (prebuilt->n_rows_fetched > 1000000000) {
+			/* Prevent wrap-around */
+ prebuilt->n_rows_fetched = 500000000;
+ }
+
+ mode = pcur->search_mode;
+ }
+
+ if (match_mode == ROW_SEL_EXACT && index->type & DICT_UNIQUE
+ && index->type & DICT_CLUSTERED
+ && dtuple_get_n_fields(search_tuple)
+ == dict_index_get_n_unique(index)) {
+
+ if (direction == ROW_SEL_NEXT) {
+ /* MySQL sometimes seems to do fetch next even
+ if the search condition is unique; we do not store
+ pcur position in this case, so we cannot
+ restore cursor position, and must return
+ immediately */
+
+ return(DB_RECORD_NOT_FOUND);
+ }
+
+ ut_a(direction == 0); /* We cannot do fetch prev, as we have
+ not stored the cursor position */
+ mode = PAGE_CUR_GE;
+
+ unique_search_from_clust_index = TRUE;
+ }
+
+ /* Note that if the search mode was GE or G, then the cursor
+ naturally moves upward (in fetch next) in alphabetical order,
+ otherwise downward */
+
+ if (direction == 0) {
+ if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
+ moves_up = TRUE;
+ }
+ } else if (direction == ROW_SEL_NEXT) {
+ moves_up = TRUE;
+ }
+
+ mtr_start(&mtr);
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (direction != 0) {
+ moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ if (!moved) {
+ goto next_rec;
+ }
+
+ } else if (dtuple_get_n_fields(search_tuple) > 0) {
+
+ btr_pcur_open_with_no_init(index, search_tuple, mode,
+ BTR_SEARCH_LEAF,
+ pcur, 0, &mtr);
+ } else {
+ if (mode == PAGE_CUR_G) {
+ btr_pcur_open_at_index_side(TRUE, index,
+ BTR_SEARCH_LEAF, pcur, FALSE, &mtr);
+ } else if (mode == PAGE_CUR_L) {
+ btr_pcur_open_at_index_side(FALSE, index,
+ BTR_SEARCH_LEAF, pcur, FALSE, &mtr);
+ }
+ }
+
+ if (!prebuilt->sql_stat_start) {
+ /* No need to set an intention lock or assign a read view */
+
+ } else if (prebuilt->select_lock_type == LOCK_NONE) {
+ /* This is a consistent read */
+ trx_start_if_not_started(trx);
+
+ /* Assign a read view for the query */
+
+ trx_assign_read_view(trx);
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ trx_start_if_not_started(trx);
+
+ if (prebuilt->select_lock_type == LOCK_S) {
+ err = lock_table(0, index->table, LOCK_IS, thr);
+ } else {
+ err = lock_table(0, index->table, LOCK_IX, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ prebuilt->sql_stat_start = FALSE;
+ }
+
+ /*-------------------------------------------------------------*/
+rec_loop:
+ cons_read_requires_clust_rec = FALSE;
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. */
+
+ goto next_rec;
+ }
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record */
+
+ err = sel_set_rec_lock(rec, index, prebuilt->select_lock_type,
+ thr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
+
+ /* A page supremum record cannot be in the result set: skip
+ it now when we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (unique_search_from_clust_index && btr_pcur_get_up_match(pcur)
+ == dtuple_get_n_fields(search_tuple)) {
+ /* The record matches enough */
+
+ ut_ad(mode == PAGE_CUR_GE);
+
+ } else if (match_mode == ROW_SEL_EXACT) {
+		/* Test if the index record completely matches search_tuple
+		in prebuilt: if not, we return DB_RECORD_NOT_FOUND */
+
+ if (0 != cmp_dtuple_rec(search_tuple, rec)) {
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ ret = DB_RECORD_NOT_FOUND;
+
+ goto normal_return;
+ }
+
+ } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+ if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) {
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ ret = DB_RECORD_NOT_FOUND;
+
+ goto normal_return;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ /* Get the right version of the row in a consistent read */
+
+ if (prebuilt->select_lock_type == LOCK_NONE) {
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ cons_read_requires_clust_rec = FALSE;
+
+ if (index == clust_index) {
+
+ if (!lock_clust_rec_cons_read_sees(rec, index,
+ trx->read_view)) {
+
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index,
+ prebuilt, rec,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row did not exist yet in
+ the read view */
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, index,
+ trx->read_view)) {
+ /* We are looking into a non-clustered index,
+ and to get the right version of the record we
+ have to look also into the clustered index: this
+ is necessary, because we can only get the undo
+ information via the clustered index record. */
+
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+
+ if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) {
+
+ /* The record is delete marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ goto next_rec;
+ }
+
+ /* Get the clustered index record if needed and if we did
+ not do the search using the clustered index */
+
+ index_rec = rec;
+
+ if (index != clust_index && (cons_read_requires_clust_rec
+ || prebuilt->need_to_access_clustered)) {
+
+		/* It was a non-clustered index and we must also fetch the
+		clustered index record */
+
+ mtr_has_extra_clust_latch = TRUE;
+
+ err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
+ thr, &clust_rec, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(prebuilt->select_lock_type == LOCK_NONE);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec)) {
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ rec = clust_rec;
+ }
+
+ /* We found a qualifying row */
+
+ if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD
+ && !prebuilt->templ_contains_blob
+ && prebuilt->select_lock_type == LOCK_NONE
+ && !prebuilt->clust_index_was_generated) {
+
+ /* Inside an update, for example, we do not cache rows,
+ since we may use the cursor position to do the actual
+		update: that is why we require select_lock_type == LOCK_NONE */
+
+ row_sel_push_cache_row_for_mysql(prebuilt, rec);
+
+ if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) {
+
+ goto got_row;
+ }
+
+ goto next_rec;
+ } else {
+ row_sel_store_mysql_rec(buf, prebuilt, rec);
+
+ if (prebuilt->clust_index_was_generated) {
+ row_sel_store_row_id_to_prebuilt(prebuilt, index_rec,
+ index);
+ }
+ }
+got_row:
+ /* TODO: should we in every case store the cursor position, even
+ if this is just a join, for example? */
+
+ if (!unique_search_from_clust_index
+ || prebuilt->select_lock_type == LOCK_X) {
+
+ /* Inside an update always store the cursor position */
+
+ btr_pcur_store_position(pcur, &mtr);
+ }
+
+ ret = DB_SUCCESS;
+
+ goto normal_return;
+ /*-------------------------------------------------------------*/
+next_rec:
+ if (mtr_has_extra_clust_latch) {
+ /* We must commit mtr if we are moving to the next
+ non-clustered index record, because we could break the
+		latching order if we accessed a different clustered
+		index page right away without releasing the previous one. */
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ mtr_start(&mtr);
+ moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ if (moved) {
+ goto rec_loop;
+ }
+ }
+
+ if (moves_up) {
+ moved = btr_pcur_move_to_next(pcur, &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(pcur, &mtr);
+ }
+
+ if (!moved) {
+ btr_pcur_store_position(pcur, &mtr);
+
+ if (match_mode != 0) {
+ ret = DB_RECORD_NOT_FOUND;
+ } else {
+ ret = DB_END_OF_INDEX;
+ }
+
+ goto normal_return;
+ }
+
+ goto rec_loop;
+ /*-------------------------------------------------------------*/
+lock_wait_or_error:
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ trx->error_state = err;
+
+ /* The following is a patch for MySQL */
+
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ mtr_start(&mtr);
+
+ sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+ mode = pcur->search_mode;
+
+ goto rec_loop;
+ }
+
+ return(err);
+
+normal_return:
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ mtr_commit(&mtr);
+
+ if (prebuilt->n_fetch_cached > 0) {
+ row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+ ret = DB_SUCCESS;
+ }
+
+ return(ret);
+}
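+
+/* EXAMPLE (illustrative, not part of the original code): the calling
+convention described above, for a full index scan with a search tuple
+of 0 fields; error handling omitted. The cursor is opened with
+direction == 0, after which each following row is fetched with
+ROW_SEL_NEXT:
+
+	err = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
+
+	while (err == DB_SUCCESS) {
+		... process the row in buf ...
+
+		err = row_search_for_mysql(buf, 0, prebuilt, 0,
+							ROW_SEL_NEXT);
+	}
+
+	err == DB_END_OF_INDEX then signals a normal end of the scan
+
+On the fetch calls the mode parameter is ignored: it is overwritten
+from pcur->search_mode, as is done at the start of the function. */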
diff --git a/innobase/row/row0uins.c b/innobase/row/row0uins.c
new file mode 100644
index 00000000000..68115895dbb
--- /dev/null
+++ b/innobase/row/row0uins.c
@@ -0,0 +1,308 @@
+/******************************************************
+Fresh insert undo
+
+(c) 1996 Innobase Oy
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0uins.h"
+
+#ifdef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+
+/*******************************************************************
+Removes a clustered index record. The pcur in node was positioned on the
+record, now it is detached. */
+static
+ulint
+row_undo_ins_remove_clust_rec(
+/*==========================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ btr_cur_t* btr_cur;
+ ibool success;
+ ulint err;
+ ulint n_tries = 0;
+ mtr_t mtr;
+
+ UT_NOT_USED(thr);
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur),
+ &mtr);
+ ut_a(success);
+
+ if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+ /* Drop the index tree associated with the row in
+ SYS_INDEXES table: */
+
+ dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF,
+ &(node->pcur), &mtr);
+ ut_a(success);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&(node->pcur));
+
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ if (success) {
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(DB_SUCCESS);
+ }
+retry:
+	/* If the optimistic delete did not succeed, try a pessimistic
+	descent down the tree */
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_TREE,
+ &(node->pcur), &mtr);
+ ut_a(success);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err == DB_OUT_OF_FILE_SPACE
+ && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(err);
+}
+
+/*******************************************************************
+Removes a secondary index entry if found. */
+static
+ulint
+row_undo_ins_remove_sec_low(
+/*========================*/
+ /* out: DB_SUCCESS, DB_FAIL, or
+ DB_OUT_OF_FILE_SPACE */
+ ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry to remove */
+ que_thr_t* thr) /* in: query thread */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool found;
+ ibool success;
+ ulint err;
+ mtr_t mtr;
+
+ UT_NOT_USED(thr);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (!found) {
+ /* Not found */
+
+ /* FIXME: remove printfs in the final version */
+
+ /* printf(
+ "--UNDO INS: Record not found from page %lu index %s\n",
+ buf_frame_get_page_no(btr_cur_get_rec(btr_cur)),
+ index->name); */
+
+ /* ibuf_print(); */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(DB_SUCCESS);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/*******************************************************************
+Removes a secondary index entry from the index if found. Tries first
+optimistic, then pessimistic descent down the tree. */
+static
+ulint
+row_undo_ins_remove_sec(
+/*====================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ dict_index_t* index, /* in: index */
+	dtuple_t*	entry,	/* in: index entry to remove */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+ ulint n_tries = 0;
+
+	/* First try an optimistic descent down the B-tree */
+
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry, thr);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+	/* Then try a pessimistic descent down the B-tree */
+retry:
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry, thr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ return(err);
+}
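+
+#if 0
+/* EXAMPLE: not part of the original code. The optimistic/pessimistic
+retry discipline used above, and in row_undo_ins_remove_clust_rec, in
+isolation; the function pointer type and the helper name are
+hypothetical. */
+typedef ulint (*ex_undo_op_t)(ulint mode);
+
+static ulint
+ex_undo_retry(ex_undo_op_t op)
+{
+	ulint	err;
+	ulint	n_tries = 0;
+
+	/* First a cheap attempt confined to a single leaf page */
+	err = op(BTR_MODIFY_LEAF);
+
+	if (err == DB_SUCCESS) {
+
+		return(err);
+	}
+
+	/* Fall back to a pessimistic descent which may restructure the
+	tree; retry a bounded number of times, sleeping in between, as
+	the failure may be a temporary lack of file space */
+	for (;;) {
+		err = op(BTR_MODIFY_TREE);
+
+		if (err == DB_SUCCESS
+		    || n_tries >= BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+			return(err);
+		}
+
+		n_tries++;
+
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+	}
+}
+#endif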
+
+/***************************************************************
+Parses the row reference and other info in a fresh insert undo record. */
+static
+void
+row_undo_ins_parse_undo_rec(
+/*========================*/
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ dulint undo_no;
+ dulint table_id;
+ ulint type;
+ ulint dummy;
+
+ ut_ad(node && thr);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy, &undo_no,
+ &table_id);
+ ut_ad(type == TRX_UNDO_INSERT_REC);
+ node->rec_type = type;
+
+ /* NOTE that the table has to be explicitly released later */
+ node->table = dict_table_get_on_id(table_id, node->trx);
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+}
+
+/***************************************************************
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even a
+delete-marked one, at the time of the insert. */
+
+ulint
+row_undo_ins(
+/*=========*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dtuple_t* entry;
+ ibool found;
+ ulint err;
+
+ ut_ad(node && thr);
+ ut_ad(node->state == UNDO_NODE_INSERT);
+
+ row_undo_ins_parse_undo_rec(node, thr);
+
+ found = row_undo_search_clust_to_pcur(node, thr);
+
+ if (!found) {
+ return(DB_SUCCESS);
+ }
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ while (node->index != NULL) {
+ entry = row_build_index_entry(node->row, node->index,
+ node->heap);
+ err = row_undo_ins_remove_sec(node->index, entry, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ err = row_undo_ins_remove_clust_rec(node, thr);
+
+ return(err);
+}
diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c
new file mode 100644
index 00000000000..2aa223a6186
--- /dev/null
+++ b/innobase/row/row0umod.c
@@ -0,0 +1,608 @@
+/******************************************************
+Undo modify of a row
+
+(c) 1997 Innobase Oy
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0umod.h"
+
+#ifdef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "log0log.h"
+
+/* Considerations on undoing a modify operation.
+(1) Undoing a delete marking: all index records should be found. Some of
+them may already have their delete mark set to FALSE, if the delete mark
+operation was stopped midway, or if the undo operation ended prematurely
+because of a system crash.
+(2) Undoing an update of a delete unmarked record: the newer version of
+an updated secondary index entry should be removed if no prior version
+of the clustered index record requires its existence. Otherwise, it should
+be delete marked.
+(3) Undoing an update of a delete marked record. In this kind of update a
+delete marked clustered index record was delete unmarked and possibly also
+some of its fields were changed. Now, it is possible that the delete marked
+version has become obsolete at the time the undo is started. */
+
+/***************************************************************
+Checks if also the previous version of the clustered index record was
+modified or inserted by the same transaction, and its undo number is such
+that it should be undone in the same rollback. */
+UNIV_INLINE
+ibool
+row_undo_mod_undo_also_prev_vers(
+/*=============================*/
+ /* out: TRUE if also previous modify or
+ insert of this row should be undone */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dulint* undo_no)/* out: the undo number */
+{
+ trx_undo_rec_t* undo_rec;
+ ibool ret;
+ trx_t* trx;
+
+ UT_NOT_USED(thr);
+
+ trx = node->trx;
+
+ if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) {
+
+ return(FALSE);
+ }
+
+ undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr, node->heap);
+
+ *undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+ if (ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0) {
+ ret = TRUE;
+ } else {
+ ret = FALSE;
+ }
+
+ return(ret);
+}
+
+/***************************************************************
+Undoes a modify in a clustered index record. */
+static
+ulint
+row_undo_mod_clust_low(
+/*===================*/
+ /* out: DB_SUCCESS, DB_FAIL, or error code:
+ we may run out of file space */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr, /* in: mtr */
+ ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool success;
+ ibool do_remove;
+
+ index = dict_table_get_first_index(node->table);
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ success = btr_pcur_restore_position(mode, pcur, mtr);
+
+ ut_ad(success);
+
+ /* Find out if we can remove the whole clustered index record */
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC
+ && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
+
+ do_remove = TRUE;
+ } else {
+ do_remove = FALSE;
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+
+ if (do_remove) {
+ success = btr_cur_optimistic_delete(btr_cur, mtr);
+
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ if (do_remove) {
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ } else {
+ err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Undoes a modify in a clustered index record. Also sets the node state
+for the next round of undo. */
+static
+ulint
+row_undo_mod_clust(
+/*===============*/
+ /* out: DB_SUCCESS or error code: we may run
+ out of file space */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ btr_pcur_t* pcur;
+ mtr_t mtr;
+ ulint err;
+ ibool success;
+ ibool more_vers;
+ dulint new_undo_no;
+
+ ut_ad(node && thr);
+
+ /* Check if also the previous version of the clustered index record
+ should be undone in this same rollback operation */
+
+ more_vers = row_undo_mod_undo_also_prev_vers(node, thr, &new_undo_no);
+
+ pcur = &(node->pcur);
+
+ mtr_start(&mtr);
+
+ /* Try optimistic processing of the record, keeping changes within
+ the index page */
+
+ err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a pessimistic
+ descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE);
+ }
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ if (more_vers && err == DB_SUCCESS) {
+
+		/* Reserve the undo log record of the prior version after
+ committing &mtr: this is necessary to comply with the latching
+ order, as &mtr may contain the fsp latch which is lower in
+ the latch hierarchy than trx->undo_mutex. */
+
+ success = trx_undo_rec_reserve(node->trx, new_undo_no);
+
+ if (success) {
+ node->state = UNDO_NODE_PREV_VERS;
+ }
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Delete marks or removes a secondary index entry if found. */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec_low(
+/*====================================*/
+ /* out: DB_SUCCESS, DB_FAIL, or
+ DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry, /* in: index entry */
+ ulint mode) /* in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ ibool found;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has;
+ ulint err;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (!found) {
+ /* Not found */
+
+ /* FIXME: remove printfs in the final version */
+
+ /* printf(
+ "--UNDO MOD: Record not found from page %lu index %s\n",
+ buf_frame_get_page_no(btr_cur_get_rec(btr_cur)),
+ index->name); */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(DB_SUCCESS);
+ }
+
+	/* We should remove the index record if no prior version of the row,
+	which cannot be purged yet, requires its existence. If some such
+	version requires it, we should delete mark the record instead. */
+
+ mtr_start(&mtr_vers);
+
+ success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur),
+ &mtr_vers);
+ ut_ad(success);
+
+ old_has = row_vers_old_has_index_entry(FALSE,
+ btr_pcur_get_rec(&(node->pcur)),
+ &mtr_vers, index, entry);
+ if (old_has) {
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, &mtr);
+ ut_ad(err == DB_SUCCESS);
+ } else {
+ /* Remove the index record */
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***************************************************************
+Delete marks or removes a secondary index entry if found. */
+UNIV_INLINE
+ulint
+row_undo_mod_del_mark_or_remove_sec(
+/*================================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry */
+{
+ ulint err;
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_LEAF);
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_TREE);
+ return(err);
+}
+
+/***************************************************************
+Delete unmarks a secondary index entry which must be found. */
+static
+void
+row_undo_mod_del_unmark_sec(
+/*========================*/
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: index */
+ dtuple_t* entry) /* in: index entry */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool found;
+
+ UT_NOT_USED(node);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
+ &mtr);
+ ut_a(found);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, FALSE, thr, &mtr);
+ ut_ad(err == DB_SUCCESS);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+}
+
+/***************************************************************
+Undoes a modify in secondary indexes when undo record type is UPD_DEL. */
+static
+ulint
+row_undo_mod_upd_del_sec(
+/*=====================*/
+ /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ entry = row_build_index_entry(node->row, index, heap);
+
+ err = row_undo_mod_del_mark_or_remove_sec(node, thr, index,
+ entry);
+ if (err != DB_SUCCESS) {
+
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Undoes a modify in secondary indexes when undo record type is DEL_MARK. */
+static
+ulint
+row_undo_mod_del_mark_sec(
+/*======================*/
+ /* out: DB_SUCCESS */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_undo_mod_del_unmark_sec(node, thr, index, entry);
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Undoes a modify in secondary indexes when undo record type is UPD_EXIST. */
+static
+ulint
+row_undo_mod_upd_exist_sec(
+/*=======================*/
+ /* out: DB_SUCCESS or error code */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ /* No change in secondary indexes */
+
+ return(DB_SUCCESS);
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ if (row_upd_changes_ord_field(node->row, node->index,
+ node->update)) {
+
+ /* Build the newest version of the index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ err = row_undo_mod_del_mark_or_remove_sec(node, thr,
+ index, entry);
+ if (err != DB_SUCCESS) {
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ /* We may have to update the delete mark in the
+ secondary index record of the previous version of
+ the row */
+
+ row_upd_index_replace_new_col_vals(entry, index,
+ node->update);
+
+ row_undo_mod_del_unmark_sec(node, thr, index, entry);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Parses the row reference and other info in a modify undo log record. */
+static
+void
+row_undo_mod_parse_undo_rec(
+/*========================*/
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ dulint undo_no;
+ dulint table_id;
+ dulint trx_id;
+ dulint roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+
+ ut_ad(node && thr);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ &undo_no, &table_id);
+ node->rec_type = type;
+
+ /* NOTE that the table has to be explicitly released later */
+ node->table = dict_table_get_on_id(table_id, thr_get_trx(thr));
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, node->heap,
+ &(node->update));
+ node->new_roll_ptr = roll_ptr;
+ node->new_trx_id = trx_id;
+ node->cmpl_info = cmpl_info;
+}
+
+/***************************************************************
+Undoes a modify operation on a row of a table. */
+
+ulint
+row_undo_mod(
+/*=========*/
+ /* out: DB_SUCCESS or error code */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ibool found;
+ ulint err;
+
+ ut_ad(node && thr);
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+
+ row_undo_mod_parse_undo_rec(node, thr);
+
+ found = row_undo_search_clust_to_pcur(node, thr);
+
+ if (!found) {
+ /* It is already undone, or will be undone by another query
+ thread */
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ return(DB_SUCCESS);
+ }
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+ err = row_undo_mod_upd_exist_sec(node, thr);
+
+ } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+
+ err = row_undo_mod_del_mark_sec(node, thr);
+ } else {
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ err = row_undo_mod_upd_del_sec(node, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_clust(node, thr);
+
+ return(err);
+}
diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c
new file mode 100644
index 00000000000..6dc032f7e13
--- /dev/null
+++ b/innobase/row/row0undo.c
@@ -0,0 +1,313 @@
+/******************************************************
+Row undo
+
+(c) 1997 Innobase Oy
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes, and so does the roll ptr. What if the row id was
+not part of the ordering fields in the clustered index? Maybe we would have
+to write it to the undo log. Well, maybe not, because if we order the row id
+and trx id in descending order, then the only undeleted copy is the first in
+the index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that the row id is in ascending order.
+So, let's store the row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to a situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it updates neither a secondary index field nor a clustered index
+ord field, then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is probably not a problem.
+(2) If it updates a secondary index ord field but not a clustered ord
+field: then in the secondary index there are delete marked records which
+differ in an ord field. No problem.
+(3) If it updates a clustered ord field but not a secondary one, and the
+secondary index is unique, then the record in the secondary index is just
+updated at the clustered ord field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge? A record can be removed from the clustered index
+if its linked list becomes empty, i.e., the row has been marked deleted
+and its roll ptr points to the record in the undo log we are going through,
+doing the purge. Similarly, during a rollback, a record can be removed
+if the stored roll ptr in the undo log points to a trx already (being) purged,
+or if the roll ptr is NULL, i.e., it was a fresh insert. */
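+
+/* A toy model of the version list described above; it is illustration
+only, and the toy_ structs and functions are invented names, not InnoDB
+code. The clustered index record is the head of the list, and each undo
+log record reached through a roll ptr holds one older version. */
+
+#if 0 /* illustration only: not compiled */
+typedef struct toy_undo_rec_struct toy_undo_rec_t;
+struct toy_undo_rec_struct {
+ toy_undo_rec_t* prev_version; /* next older version, or NULL if the
+ row was a fresh insert */
+};
+
+typedef struct {
+ ibool del_marked; /* delete mark of the newest version */
+ toy_undo_rec_t* roll_ptr; /* head of the undo list, or NULL for
+ a fresh insert */
+} toy_clust_rec_t;
+
+/* In this model, purge may remove the clustered index record when the
+row is delete marked and its roll ptr points to the undo log record the
+purge is currently processing, i.e., the version list has become empty: */
+static
+ibool
+toy_purge_may_remove(
+/*=================*/
+ /* out: TRUE if purge may remove the record */
+ toy_clust_rec_t* rec, /* in: clustered index record */
+ toy_undo_rec_t* purge_pos) /* in: undo record being purged */
+{
+ return(rec->del_marked && rec->roll_ptr == purge_pos);
+}
+#endif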
+
+/************************************************************************
+Creates a row undo node to a query graph. */
+
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ /* out, own: undo node */
+ trx_t* trx, /* in: transaction */
+ que_thr_t* parent, /* in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /* in: memory heap where created */
+{
+ undo_node_t* undo;
+
+ ut_ad(trx && parent && heap);
+
+ undo = mem_heap_alloc(heap, sizeof(undo_node_t));
+
+ undo->common.type = QUE_NODE_UNDO;
+ undo->common.parent = parent;
+
+ undo->state = UNDO_NODE_FETCH_NEXT;
+ undo->trx = trx;
+
+ undo->heap = mem_heap_create(256);
+
+ return(undo);
+}
+
+/***************************************************************
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case. */
+
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ /* out: TRUE if found; NOTE the node->pcur
+ must be closed by the caller, regardless of
+ the return value */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* clust_index;
+ ibool found;
+ mtr_t mtr;
+ ibool ret;
+ rec_t* rec;
+
+ UT_NOT_USED(thr);
+
+ mtr_start(&mtr);
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
+ node->table, node->ref, &mtr);
+
+ rec = btr_pcur_get_rec(&(node->pcur));
+
+ if (!found || 0 != ut_dulint_cmp(node->roll_ptr,
+ row_get_rec_roll_ptr(rec, clust_index))) {
+
+ /* We must remove the reservation on the undo log record
+ BEFORE releasing the latch on the clustered index page: this
+ is to make sure that some thread will eventually undo the
+ modification corresponding to node->roll_ptr. */
+
+ /* printf("--------------------undoing a previous version\n");
+ */
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ ret = FALSE;
+ } else {
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec,
+ node->heap);
+ btr_pcur_store_position(&(node->pcur), &mtr);
+
+ ret = TRUE;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ return(ret);
+}
+
+/***************************************************************
+Fetches an undo log record and does the undo for the recorded operation.
+If none left, or a partial rollback completed, returns control to the
+parent node, which is always a query thread node. */
+static
+ulint
+row_undo(
+/*=====*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code */
+ undo_node_t* node, /* in: row undo node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+ trx_t* trx;
+ dulint roll_ptr;
+
+ ut_ad(node && thr);
+
+ trx = node->trx;
+
+ if (node->state == UNDO_NODE_FETCH_NEXT) {
+
+ /* The call below also starts an mtr */
+ node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
+ trx->roll_limit,
+ &roll_ptr,
+ node->heap);
+ if (!node->undo_rec) {
+ /* Rollback completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+
+ } else if (node->state == UNDO_NODE_PREV_VERS) {
+
+ /* Undo should be done to the same clustered index record
+ again in this same rollback, restoring the previous version */
+
+ roll_ptr = node->new_roll_ptr;
+
+ node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr,
+ node->heap);
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+ }
+
+ if (node->state == UNDO_NODE_INSERT) {
+
+ err = row_undo_ins(node, thr);
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+ } else {
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+ err = row_undo_mod(node, thr);
+ }
+
+ /* Do some cleanup */
+ btr_pcur_close(&(node->pcur));
+
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(err);
+}
+
+/***************************************************************
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs. */
+
+que_thr_t*
+row_undo_step(
+/*==========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+ undo_node_t* node;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ srv_activity_count++;
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
+
+ err = row_undo(node, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* SQL error detected */
+
+ ut_a(0);
+
+ return(NULL);
+ }
+
+ return(thr);
+}
diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c
new file mode 100644
index 00000000000..44843494247
--- /dev/null
+++ b/innobase/row/row0upd.c
@@ -0,0 +1,1394 @@
+/******************************************************
+Update of a row
+
+(c) 1996 Innobase Oy
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+
+#ifdef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+
+
+/* What kind of latch and lock can we assume when the control comes to
+ -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+ Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only when the
+undo log is purged will the index records be physically deleted from
+the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on a page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way for performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
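+
+/* A minimal sketch of the delete mark protocol described above; it is
+illustration only, and the toy_ names are invented, not the real InnoDB
+record or undo log formats. Only the delete bit and the sys fields of the
+record change; the tree structure does not. */
+
+#if 0 /* illustration only: not compiled */
+typedef struct {
+ dulint old_trx_id; /* trx id before the delete */
+ dulint old_roll_ptr; /* roll ptr before the delete */
+} toy_undo_t;
+
+typedef struct {
+ ibool del_marked; /* the delete bit */
+ dulint trx_id; /* id of the last modifying transaction */
+ dulint roll_ptr; /* roll ptr to the undo log */
+} toy_rec_t;
+
+static
+void
+toy_del_mark(
+/*=========*/
+ toy_rec_t* rec, /* in/out: record to delete mark */
+ dulint trx_id, /* in: id of the deleting transaction */
+ dulint roll_ptr, /* in: roll ptr of the new undo record */
+ toy_undo_t* undo) /* out: saved old sys field values */
+{
+ /* Save the old sys fields to the undo log record */
+ undo->old_trx_id = rec->trx_id;
+ undo->old_roll_ptr = rec->roll_ptr;
+
+ /* Set the delete bit and substitute the new sys fields: no
+ physical change occurs in the index tree */
+ rec->del_marked = TRUE;
+ rec->trx_id = trx_id;
+ rec->roll_ptr = roll_ptr;
+}
+#endif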
+
+/*************************************************************************
+Creates an update node for a query graph. */
+
+upd_node_t*
+upd_node_create(
+/*============*/
+ /* out, own: update node */
+ mem_heap_t* heap) /* in: mem heap where created */
+{
+ upd_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(upd_node_t));
+ node->common.type = QUE_NODE_UPDATE;
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ node->select_will_do_update = FALSE;
+ node->in_mysql_interface = FALSE;
+
+ node->row = NULL;
+ node->index = NULL;
+
+ node->select = NULL;
+
+ node->heap = mem_heap_create(128);
+ node->magic_n = UPD_NODE_MAGIC_N;
+
+ node->cmpl_info = 0;
+
+ return(node);
+}
+
+/*************************************************************************
+Updates the trx id and roll ptr field in a clustered index record in database
+recovery. */
+
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+ rec_t* rec, /* in: record */
+ ulint pos, /* in: TRX_ID position in rec */
+ dulint trx_id, /* in: transaction id */
+ dulint roll_ptr)/* in: roll ptr of the undo log record */
+{
+ byte* field;
+ ulint len;
+
+ field = rec_get_nth_field(rec, pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ trx_write_trx_id(field, trx_id);
+
+ field = rec_get_nth_field(rec, pos + 1, &len);
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(field, roll_ptr);
+}
+
+/*************************************************************************
+Sets the trx id or roll ptr field of a clustered index entry. */
+
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+ dtuple_t* entry, /* in: index entry, where the memory buffers
+ for sys fields are already allocated:
+ the function just copies the new values to
+ them */
+ dict_index_t* index, /* in: clustered index */
+ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */
+ dulint val) /* in: value to write */
+{
+ dfield_t* dfield;
+ byte* field;
+ ulint pos;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ dfield = dtuple_get_nth_field(entry, pos);
+ field = dfield_get_data(dfield);
+
+ if (type == DATA_TRX_ID) {
+ trx_write_trx_id(field, val);
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+ trx_write_roll_ptr(field, val);
+ }
+}
+
+/***************************************************************
+Returns TRUE if row update changes size of some field in index. */
+
+ibool
+row_upd_changes_field_size(
+/*=======================*/
+ /* out: TRUE if the update changes the size of
+ some field in index */
+ rec_t* rec, /* in: record in clustered index */
+ dict_index_t* index, /* in: clustered index */
+ upd_t* update) /* in: update vector */
+{
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint old_len;
+ ulint new_len;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+ new_len = new_val->len;
+
+ if (new_len == UNIV_SQL_NULL) {
+ new_len = dtype_get_sql_null_size(
+ dict_index_get_nth_type(index, i));
+ }
+
+ old_len = rec_get_nth_field_size(rec, upd_field->field_no);
+
+ if (old_len != new_len) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************
+Replaces the column values of the given record with the new values stored
+in the update vector. No field size changes are allowed. This function is
+used only for a clustered index. */
+
+void
+row_upd_rec_in_place(
+/*=================*/
+ rec_t* rec, /* in/out: record where replaced */
+ upd_t* update) /* in: update vector */
+{
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint n_fields;
+ ulint i;
+
+ rec_set_info_bits(rec, update->info_bits);
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+
+ rec_set_nth_field(rec, upd_field->field_no,
+ dfield_get_data(new_val),
+ dfield_get_len(new_val));
+ }
+}
+
+/*************************************************************************
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record. */
+
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+ /* out: new pointer to mlog */
+ dict_index_t* index, /* in: clustered index */
+ trx_t* trx, /* in: transaction */
+ dulint roll_ptr,/* in: roll ptr of the undo log record */
+ byte* log_ptr,/* pointer to a buffer of size > 20 opened
+ in mlog */
+ mtr_t* mtr) /* in: mtr */
+{
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(mtr);
+
+ log_ptr += mach_write_compressed(log_ptr,
+ dict_index_get_sys_col_pos(index, DATA_TRX_ID));
+
+ trx_write_roll_ptr(log_ptr, roll_ptr);
+ log_ptr += DATA_ROLL_PTR_LEN;
+
+ log_ptr += mach_dulint_write_compressed(log_ptr, trx->id);
+
+ return(log_ptr);
+}
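+
+/* The function above and row_upd_parse_sys_vals below form a writer/
+parser pair over a single byte layout: position (compressed), roll ptr
+(fixed width), trx id (compressed). As a toy illustration of such a
+symmetric layout, here is a round trip which uses fixed 4-byte fields
+everywhere instead of the compressed machine formats; the toy_ names are
+invented and this is not InnoDB code. */
+
+#if 0 /* illustration only: not compiled */
+static
+byte*
+toy_write_sys_vals(
+/*===============*/
+ /* out: new pointer into the buffer */
+ byte* p, /* in: buffer with at least 12 bytes free */
+ ulint pos, /* in: TRX_ID position in the record */
+ ulint roll_ptr, /* in: roll ptr (toy: 4 bytes only) */
+ ulint trx_id) /* in: trx id (toy: 4 bytes only) */
+{
+ mach_write_to_4(p, pos);
+ mach_write_to_4(p + 4, roll_ptr);
+ mach_write_to_4(p + 8, trx_id);
+
+ return(p + 12);
+}
+
+static
+byte*
+toy_parse_sys_vals(
+/*===============*/
+ /* out: new pointer, or NULL if incomplete */
+ byte* p, /* in: buffer */
+ byte* end_ptr, /* in: buffer end */
+ ulint* pos, /* out: TRX_ID position */
+ ulint* roll_ptr, /* out: roll ptr */
+ ulint* trx_id) /* out: trx id */
+{
+ if (end_ptr < p + 12) {
+
+ return(NULL);
+ }
+
+ *pos = mach_read_from_4(p);
+ *roll_ptr = mach_read_from_4(p + 4);
+ *trx_id = mach_read_from_4(p + 8);
+
+ return(p + 12);
+}
+#endif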
+
+/*************************************************************************
+Parses the log data of system field values. */
+
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+ /* out: log data end or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ ulint* pos, /* out: TRX_ID position in record */
+ dulint* trx_id, /* out: trx id */
+ dulint* roll_ptr)/* out: roll ptr */
+{
+ ptr = mach_parse_compressed(ptr, end_ptr, pos);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + DATA_ROLL_PTR_LEN) {
+
+ return(NULL);
+ }
+
+ *roll_ptr = trx_read_roll_ptr(ptr);
+ ptr += DATA_ROLL_PTR_LEN;
+
+ ptr = mach_dulint_parse_compressed(ptr, end_ptr, trx_id);
+
+ return(ptr);
+}
+
+/***************************************************************
+Writes to the redo log the new values of the fields occurring in the index. */
+
+void
+row_upd_index_write_log(
+/*====================*/
+ upd_t* update, /* in: update vector */
+ byte* log_ptr,/* in: pointer to mlog buffer: must contain at least
+ MLOG_BUF_MARGIN bytes of free space; the buffer is
+ closed within this function */
+ mtr_t* mtr) /* in: mtr into whose log to write */
+{
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ byte* buf_end;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+
+ mach_write_to_1(log_ptr, update->info_bits);
+ log_ptr++;
+ log_ptr += mach_write_compressed(log_ptr, n_fields);
+
+ for (i = 0; i < n_fields; i++) {
+
+ ut_ad(MLOG_BUF_MARGIN > 30);
+
+ if (log_ptr + 30 > buf_end) {
+ mlog_close(mtr, log_ptr);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+
+ len = new_val->len;
+
+ log_ptr += mach_write_compressed(log_ptr, upd_field->field_no);
+ log_ptr += mach_write_compressed(log_ptr, len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (log_ptr + len < buf_end) {
+ ut_memcpy(log_ptr, new_val->data, len);
+
+ log_ptr += len;
+ } else {
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, new_val->data, len);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+ }
+ }
+
+ mlog_close(mtr, log_ptr);
+}
+
+/*************************************************************************
+Parses the log data written by row_upd_index_write_log. */
+
+byte*
+row_upd_index_parse(
+/*================*/
+ /* out: log data end or NULL */
+ byte* ptr, /* in: buffer */
+ byte* end_ptr,/* in: buffer end */
+ mem_heap_t* heap, /* in: memory heap where update vector is
+ built */
+ upd_t** update_out)/* out: update vector */
+{
+ upd_t* update;
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ byte* buf;
+ ulint info_bits;
+ ulint i;
+
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ info_bits = mach_read_from_1(ptr);
+ ptr++;
+ ptr = mach_parse_compressed(ptr, end_ptr, &n_fields);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ update = upd_create(n_fields, heap);
+ update->info_bits = info_bits;
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+
+ ptr = mach_parse_compressed(ptr, end_ptr,
+ &(upd_field->field_no));
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &len);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ new_val->len = len;
+
+ if (len != UNIV_SQL_NULL) {
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ } else {
+ buf = mem_heap_alloc(heap, len);
+ ut_memcpy(buf, ptr, len);
+
+ ptr += len;
+
+ new_val->data = buf;
+ }
+ }
+ }
+
+ *update_out = update;
+
+ return(ptr);
+}
+
+/*******************************************************************
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the same ordering fields. */
+
+upd_t*
+row_upd_build_difference(
+/*=====================*/
+ /* out, own: update vector of differing
+ fields, excluding roll ptr and trx id */
+ dict_index_t* index, /* in: clustered index */
+ dtuple_t* entry, /* in: entry to insert */
+ rec_t* rec, /* in: clustered index record */
+ mem_heap_t* heap) /* in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ dfield_t* dfield;
+ byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ ulint roll_ptr_pos;
+ ulint trx_id_pos;
+ ulint i;
+
+ /* This function is used only for a clustered index */
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+
+ roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR);
+ trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, i, &len);
+ dfield = dtuple_get_nth_field(entry, i);
+
+ if ((i != trx_id_pos) && (i != roll_ptr_pos)
+ && !dfield_data_is_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index);
+
+ n_diff++;
+ }
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
+
+/***************************************************************
+Copies the new column values stored in the update vector into the given
+index entry. */
+
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+ dtuple_t* entry, /* in/out: index entry where replaced */
+ dict_index_t* index, /* in: index; NOTE that may also be a
+ non-clustered index */
+ upd_t* update) /* in: update vector */
+{
+ upd_field_t* upd_field;
+ dfield_t* dfield;
+ dfield_t* new_val;
+ ulint field_no;
+ dict_index_t* clust_index;
+ ulint i;
+
+ ut_ad(index);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ field_no = dict_index_get_nth_col_pos(index,
+ dict_index_get_nth_col_no(clust_index,
+ upd_field->field_no));
+ if (field_no != ULINT_UNDEFINED) {
+ dfield = dtuple_get_nth_field(entry, field_no);
+
+ new_val = &(upd_field->new_val);
+
+ dfield_set_data(dfield, new_val->data, new_val->len);
+ }
+ }
+}
+
+/***************************************************************
+Copies the new column values stored in the update vector into the given
+clustered index entry. */
+
+void
+row_upd_clust_index_replace_new_col_vals(
+/*=====================================*/
+ dtuple_t* entry, /* in/out: index entry where replaced */
+ upd_t* update) /* in: update vector */
+{
+ upd_field_t* upd_field;
+ dfield_t* dfield;
+ dfield_t* new_val;
+ ulint field_no;
+ ulint i;
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ field_no = upd_field->field_no;
+
+ dfield = dtuple_get_nth_field(entry, field_no);
+
+ new_val = &(upd_field->new_val);
+
+ dfield_set_data(dfield, new_val->data, new_val->len);
+ }
+}
+
+/***************************************************************
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic. */
+
+ibool
+row_upd_changes_ord_field(
+/*======================*/
+ /* out: TRUE if update vector changes
+ an ordering field in the index record */
+ dtuple_t* row, /* in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ dict_index_t* index, /* in: index of the record */
+ upd_t* update) /* in: update vector for the row */
+{
+ upd_field_t* upd_field;
+ dict_field_t* ind_field;
+ dict_col_t* col;
+ ulint n_unique;
+ ulint n_upd_fields;
+ ulint col_pos;
+ ulint col_no;
+ ulint i, j;
+
+ ut_ad(update && index);
+
+ n_unique = dict_index_get_n_unique(index);
+ n_upd_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_unique; i++) {
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col);
+ col_no = dict_col_get_no(col);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ upd_field = upd_get_nth_field(update, j);
+
+ if (col_pos == upd_field->field_no
+ && (row == NULL
+ || !dfield_datas_are_equal(
+ dtuple_get_nth_field(row, col_no),
+ &(upd_field->new_val)))) {
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
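+
+/* A toy rendering of the double loop above; it is illustration only, and
+the toy_ names are invented, not InnoDB code. Given the clustered field
+positions of the n_ord ordering columns and of the n_upd updated fields,
+it answers whether any ordering column is among the updated fields, in
+O(n_ord * n_upd) time, as noted in the comment above. */
+
+#if 0 /* illustration only: not compiled */
+static
+ibool
+toy_changes_ord_field(
+/*==================*/
+ /* out: TRUE if some ordering column is updated */
+ ulint* ord_pos, /* in: positions of the ordering columns */
+ ulint n_ord, /* in: number of ordering columns */
+ ulint* upd_pos, /* in: positions of the updated fields */
+ ulint n_upd) /* in: number of updated fields */
+{
+ ulint i;
+ ulint j;
+
+ for (i = 0; i < n_ord; i++) {
+ for (j = 0; j < n_upd; j++) {
+ if (ord_pos[i] == upd_pos[j]) {
+
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+#endif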
+
+/***************************************************************
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic. */
+
+ibool
+row_upd_changes_some_index_ord_field(
+/*=================================*/
+ /* out: TRUE if update vector may change
+ an ordering field in an index record */
+ dict_table_t* table, /* in: table */
+ upd_t* update) /* in: update vector for the row */
+{
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+
+ while (index) {
+ if (row_upd_changes_ord_field(NULL, index, update)) {
+
+ return(TRUE);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************************
+Copies the column values from a record. */
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+ rec_t* rec, /* in: record in a clustered index */
+ sym_node_t* column) /* in: first column in a column list, or
+ NULL */
+{
+ byte* data;
+ ulint len;
+
+ while (column) {
+ data = rec_get_nth_field(rec,
+ column->field_nos[SYM_CLUST_FIELD_NO],
+ &len);
+ eval_node_copy_and_alloc_val(column, data, len);
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*************************************************************************
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+ upd_t* update) /* in: update vector */
+{
+ que_node_t* exp;
+ upd_field_t* upd_field;
+ ulint n_fields;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ exp = upd_field->exp;
+
+ eval_exp(exp);
+
+ dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp));
+ }
+}
+
+/***************************************************************
+Stores to the heap the row on which the node->pcur is positioned. */
+UNIV_INLINE
+void
+row_upd_store_row(
+/*==============*/
+ upd_node_t* node) /* in: row update node */
+{
+ dict_index_t* clust_index;
+
+ ut_ad((node->pcur)->latch_mode != BTR_NO_LATCHES);
+
+ if (node->row != NULL) {
+ mem_heap_empty(node->heap);
+ node->row = NULL;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ node->row = row_build(ROW_COPY_DATA, clust_index,
+ btr_pcur_get_rec(node->pcur), node->heap);
+}
+
+/***************************************************************
+Updates a secondary index entry of a row. */
+static
+ulint
+row_upd_sec_index_entry(
+/*====================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ibool found;
+ dict_index_t* index;
+ dtuple_t* entry;
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ mem_heap_t* heap;
+ rec_t* rec;
+ ulint err;
+
+ index = node->index;
+
+ heap = mem_heap_create(1024);
+
+ /* Build old index entry */
+ entry = row_build_index_entry(node->row, index, heap);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
+ &mtr);
+ ut_ad(found);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ rec = btr_cur_get_rec(btr_cur);
+
+ /* Delete mark the old index record; it may already be delete marked if
+ we return after a lock wait in row_ins_index_entry below */
+
+ if (!rec_get_deleted_flag(rec)) {
+ err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE, thr,
+ &mtr);
+ } else {
+ /* Already delete marked: initialize err here, as it is
+ tested below even when no delete mark is set */
+ err = DB_SUCCESS;
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (node->is_delete || (err != DB_SUCCESS)) {
+
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ /* Build a new index entry */
+ row_upd_index_replace_new_col_vals(entry, index, node->update);
+
+ /* Insert new index entry */
+ err = row_ins_index_entry(index, entry, thr);
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***************************************************************
+Updates a secondary index record if it is changed in the row update. This
+should be quite rare in database applications. */
+UNIV_INLINE
+ulint
+row_upd_sec_step(
+/*=============*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+ || (node->state == UPD_NODE_UPDATE_SOME_SEC));
+ ut_ad(!(node->index->type & DICT_CLUSTERED));
+
+ if ((node->state == UPD_NODE_UPDATE_ALL_SEC)
+ || row_upd_changes_ord_field(node->row, node->index,
+ node->update)) {
+ err = row_upd_sec_index_entry(node, thr);
+
+ return(err);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************
+Marks the clustered index record deleted and inserts the updated version
+of the record to the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications. */
+static
+ulint
+row_upd_clust_rec_by_insert(
+/*========================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ dict_index_t* index, /* in: clustered index of the record */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr; gets committed here */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ trx_t* trx;
+ dict_table_t* table;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ trx = thr_get_trx(thr);
+ table = node->table;
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ if (node->state != UPD_NODE_INSERT_CLUSTERED) {
+
+ err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, mtr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+
+ return(err);
+ }
+ }
+
+ mtr_commit(mtr);
+
+ node->state = UPD_NODE_INSERT_CLUSTERED;
+
+ heap = mem_heap_create(1024);
+
+ entry = row_build_index_entry(node->row, index, heap);
+
+ row_upd_clust_index_replace_new_col_vals(entry, node->update);
+
+ row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
+
+ err = row_ins_index_entry(index, entry, thr);
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***************************************************************
+Updates a clustered index record of a row when the ordering fields do
+not change. */
+static
+ulint
+row_upd_clust_rec(
+/*==============*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ dict_index_t* index, /* in: clustered index */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr; gets committed here */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(index->type & DICT_CLUSTERED);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
+
+ /* Try optimistic updating of the record, keeping changes within
+ the page; we do not check locks, because we assume we have an x-lock
+ on the record to update */
+
+ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
+ err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ } else {
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* We may have to modify the tree structure: do a pessimistic descent
+ down the index tree */
+
+ mtr_start(mtr);
+
+ /* NOTE: this transaction has an s-lock or x-lock on the record and
+ therefore other transactions cannot modify the record when we have no
+ latch on the page. In addition, we assume that other query threads of
+ the same transaction do not modify the record in the meantime.
+ Therefore we can assert that the restoration of the cursor succeeds. */
+
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+
+ ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
+
+ err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
+ node->update, node->cmpl_info, thr, mtr);
+ mtr_commit(mtr);
+
+ return(err);
+}
+
+/***************************************************************
+Delete marks a clustered index record. */
+static
+ulint
+row_upd_del_mark_clust_rec(
+/*=======================*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ dict_index_t* index, /* in: clustered index */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr; gets committed here */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(node->is_delete);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
+
+ /* Store the row, because we also have to build the secondary index
+ entries */
+
+ row_upd_store_row(node);
+
+ /* Mark the clustered index record deleted; we do not have to check
+ locks, because we assume that we have an x-lock on the record */
+
+ err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, btr_cur,
+ TRUE, thr, mtr);
+ mtr_commit(mtr);
+
+ return(err);
+}
+
+/***************************************************************
+Updates the clustered index record. */
+static
+ulint
+row_upd_clust_step(
+/*===============*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, DB_LOCK_WAIT in case of a lock wait,
+ else error code */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ ibool success;
+ ulint err;
+ mtr_t mtr_buf;
+ mtr_t* mtr;
+
+ index = dict_table_get_first_index(node->table);
+
+ pcur = node->pcur;
+
+ /* We have to restore the cursor to its position */
+ mtr = &mtr_buf;
+
+ mtr_start(mtr);
+
+ /* If the restoration does not succeed, then the same
+ transaction has deleted the record on which the cursor was,
+ and that is an SQL error. If the restoration succeeds, it may
+ still be that the same transaction has successively deleted
+ and inserted a record with the same ordering fields, but in
+ that case we know that the transaction has at least an
+ implicit x-lock on the record. */
+
+ ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+ if (!success) {
+ err = DB_RECORD_NOT_FOUND;
+
+ mtr_commit(mtr);
+
+ return(err);
+ }
+
+ /* If this is a row in SYS_INDEXES table of the data dictionary,
+ then we have to free the file segments of the index tree associated
+ with the index */
+
+ if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+ dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
+ mtr);
+ if (!success) {
+ err = DB_ERROR;
+
+ mtr_commit(mtr);
+
+ return(err);
+ }
+ }
+
+ if (!node->has_clust_rec_x_lock) {
+ err = lock_clust_rec_modify_check_and_lock(0,
+ btr_pcur_get_rec(pcur),
+ index, thr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+
+ return(err);
+ }
+ }
+
+ /* NOTE: the following function calls will also commit mtr */
+
+ if (node->is_delete) {
+ err = row_upd_del_mark_clust_rec(node, index, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ node->index = dict_table_get_next_index(index);
+
+ return(err);
+ }
+
+ /* If the update is made for MySQL, we already have the update vector
+ ready, else we have to do some evaluation: */
+
+ if (!node->in_mysql_interface) {
+ /* Copy the necessary columns from clust_rec and calculate the
+ new values to set */
+
+ row_upd_copy_columns(btr_pcur_get_rec(pcur),
+ UT_LIST_GET_FIRST(node->columns));
+ row_upd_eval_new_vals(node->update);
+ }
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+ err = row_upd_clust_rec(node, index, thr, mtr);
+
+ return(err);
+ }
+
+ row_upd_store_row(node);
+
+ if (row_upd_changes_ord_field(node->row, index, node->update)) {
+
+ /* Update causes an ordering field (ordering fields within
+ the B-tree) of the clustered index record to change: perform
+ the update by delete marking and inserting.
+
+ TODO! What to do about the 'Halloween problem', where an
+ update moves the record forward in the index so that it is
+ again updated when the cursor arrives there? Solution: the
+ read operation must check the undo record undo number when
+ choosing records to update. Currently, MySQL solves the
+ problem externally! */
+
+ err = row_upd_clust_rec_by_insert(node, index, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ } else {
+ err = row_upd_clust_rec(node, index, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_SOME_SEC;
+ }
+
+ node->index = dict_table_get_next_index(index);
+
+ return(err);
+}
+
+/***************************************************************
+Updates the affected index records of a row. When the control is transferred
+to this node, we assume that we have a persistent cursor which was on a
+record, and that its position has been stored in the cursor. */
+static
+ulint
+row_upd(
+/*====*/
+ /* out: DB_SUCCESS if operation successfully
+ completed, else error code or DB_LOCK_WAIT */
+ upd_node_t* node, /* in: row update node */
+ que_thr_t* thr) /* in: query thread */
+{
+ ulint err;
+
+ ut_ad(node && thr);
+
+ if (node->in_mysql_interface) {
+ /* We do not get the cmpl_info value from the MySQL
+ interpreter: we must calculate it on the fly: */
+
+ if (row_upd_changes_some_index_ord_field(node->table,
+ node->update)) {
+ node->cmpl_info = 0;
+ } else {
+ node->cmpl_info = UPD_NODE_NO_ORD_CHANGE;
+ }
+ }
+
+ if (node->state == UPD_NODE_UPDATE_CLUSTERED
+ || node->state == UPD_NODE_INSERT_CLUSTERED) {
+
+ err = row_upd_clust_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+ }
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+ goto function_exit;
+ }
+
+ while (node->index != NULL) {
+ err = row_upd_sec_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+function_exit:
+ if (err == DB_SUCCESS) {
+ /* Do some cleanup */
+
+ if (node->row != NULL) {
+ mem_heap_empty(node->heap);
+ node->row = NULL;
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ }
+
+ return(err);
+}
+
+/***************************************************************
+Updates a row in a table. This is a high-level function used in SQL execution
+graphs. */
+
+que_thr_t*
+row_upd_step(
+/*=========*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr) /* in: query thread */
+{
+ upd_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* parent;
+ ulint err = DB_SUCCESS;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ sel_node = node->select;
+
+ parent = que_node_get_parent(node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ if (thr->prev_node == parent) {
+ node->state = UPD_NODE_SET_IX_LOCK;
+ }
+
+ if (node->state == UPD_NODE_SET_IX_LOCK) {
+
+ if (!node->has_clust_rec_x_lock) {
+ /* It may be that the current session has not yet
+ started its transaction, or it has been committed: */
+
+ trx_start_if_not_started(thr_get_trx(thr));
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ if (node->searched_update) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to update */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ /* sel_node is NULL if we are in the MySQL interface */
+
+ if (sel_node && (sel_node->state != SEL_NODE_FETCH)) {
+
+ if (!node->searched_update) {
+ /* An explicit cursor should be positioned on a row
+ to update */
+
+ ut_error;
+
+ err = DB_ERROR;
+
+ goto error_handling;
+ }
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to update, or the select node performed the
+ updates directly in-place */
+
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_upd(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* Either DB_LOCK_WAIT or an SQL error: in both cases
+ return control */
+
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->searched_update) {
+ /* Fetch next row to update */
+
+ thr->run_node = sel_node;
+ } else {
+ /* It was an explicit cursor update */
+
+ thr->run_node = parent;
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ return(thr);
+}
+
+/*************************************************************************
+Performs an in-place update for the current clustered index record in
+select. */
+
+void
+row_upd_in_place_in_select(
+/*=======================*/
+ sel_node_t* sel_node, /* in: select node */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr */
+{
+ upd_node_t* node;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(sel_node->select_will_do_update);
+ ut_ad(sel_node->latch_mode == BTR_MODIFY_LEAF);
+ ut_ad(sel_node->asc);
+
+ node = que_node_get_parent(sel_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ /* Copy the necessary columns from clust_rec and calculate the new
+ values to set */
+
+ row_upd_copy_columns(btr_pcur_get_rec(pcur),
+ UT_LIST_GET_FIRST(node->columns));
+ row_upd_eval_new_vals(node->update);
+
+ ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
+
+ ut_ad(node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE);
+ ut_ad(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE);
+ ut_ad(node->select_will_do_update);
+
+ err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, btr_cur,
+ node->update, node->cmpl_info,
+ thr, mtr);
+ ut_ad(err == DB_SUCCESS);
+}
diff --git a/innobase/row/row0vers.c b/innobase/row/row0vers.c
new file mode 100644
index 00000000000..80acc7225df
--- /dev/null
+++ b/innobase/row/row0vers.c
@@ -0,0 +1,409 @@
+/******************************************************
+Row versions
+
+(c) 1997 Innobase Oy
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0vers.h"
+
+#ifdef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+
+/*********************************************************************
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function! */
+
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+ /* out: NULL if committed, else the active
+ transaction; NOTE that the kernel mutex is
+ temporarily released! */
+ rec_t* rec, /* in: record in a secondary index */
+ dict_index_t* index) /* in: the secondary index */
+{
+ dict_index_t* clust_index;
+ rec_t* clust_rec;
+ rec_t* version;
+ rec_t* prev_version;
+ dulint trx_id;
+ dulint prev_trx_id;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ dtuple_t* row;
+ dtuple_t* entry;
+ trx_t* trx;
+ ibool vers_del;
+ ibool rec_del;
+ ulint err;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_start(&mtr);
+
+ /* Search for the clustered index record: this is a time-consuming
+ operation: therefore we release the kernel mutex; also, the release
+ is required by the latching order convention. The latch on the
+ clustered index locks the top of the stack of versions. We also
+ reserve purge_latch to lock the bottom of the version stack. */
+
+ clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
+ &clust_index, &mtr);
+ ut_a(clust_rec);
+
+ trx_id = row_get_rec_trx_id(clust_rec, clust_index);
+
+ mtr_s_lock(&(purge_sys->latch), &mtr);
+
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* The transaction that modified or inserted clust_rec is no
+ longer active: no implicit lock on rec */
+
+ mtr_commit(&mtr);
+
+ return(NULL);
+ }
+
+ /* We check if some earlier version of the clustered index record
+ would require rec to be in a different state (delete marked or
+ unmarked, or not existing). If there is such a version, then rec was
+ modified by the trx_id transaction, and it has an implicit x-lock on
+ rec. Note that if clust_rec itself would require rec to be in a
+ different state, then the trx_id transaction has not yet had time to
+ modify rec, and does not necessarily have an implicit x-lock on rec. */
+
+ rec_del = rec_get_deleted_flag(rec);
+ trx = NULL;
+
+ version = clust_rec;
+ heap = NULL;
+
+ for (;;) {
+ mutex_exit(&kernel_mutex);
+
+ /* While we retrieve an earlier version of clust_rec, we
+ release the kernel mutex, because it may take time to access
+ the disk. After the release, we have to check if the trx_id
+ transaction is still active. We keep the semaphore in mtr on
+ the clust_rec page, so that no other transaction can update
+ it and get an implicit x-lock on rec. */
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ err = trx_undo_prev_version_build(clust_rec, &mtr, version,
+ clust_index, heap,
+ &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* version was stored in heap2,
+ if heap2 != NULL */
+ }
+
+ if (prev_version) {
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, heap);
+ entry = row_build_index_entry(row, index, heap);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* Transaction no longer active: no implicit x-lock */
+
+ break;
+ }
+
+ /* If the transaction is still active, the previous version
+ of clust_rec must be accessible if not a fresh insert; we
+ may assert the following: */
+
+ ut_ad(err == DB_SUCCESS);
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version: there is an
+ implicit x-lock on rec */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ /* If we get here, we know that the trx_id transaction is
+ still active and it has modified prev_version. Let us check
+ if prev_version would require rec to be in a different state. */
+
+ vers_del = rec_get_deleted_flag(prev_version);
+
+ if (0 == cmp_dtuple_rec(entry, rec)) {
+ /* The delete marks of rec and prev_version should be
+ equal for rec to be in the state required by
+ prev_version */
+
+ if (rec_del != vers_del) {
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+ } else if (!rec_del) {
+ /* The delete mark should be set in rec for it to be
+ in the state required by prev_version */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ prev_trx_id = row_get_rec_trx_id(prev_version, clust_index);
+
+ if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+ /* The versions modified by the trx_id transaction end
+ at prev_version: no implicit x-lock */
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(trx);
+}
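+
+/* The loop above uses a two-heap idiom which recurs in this file: the
+version built in the previous round lives in heap2, the next older
+version is built into a fresh heap, and heap2 is then freed, so at most
+two versions are allocated at a time. A stripped-down sketch of just the
+memory management, with the version building elided; toy_walk_versions is
+an invented name, not InnoDB code. */
+
+#if 0 /* illustration only: not compiled */
+static
+void
+toy_walk_versions(
+/*==============*/
+ ulint n_vers) /* in: number of versions to walk */
+{
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ ulint i;
+
+ heap = NULL;
+
+ for (i = 0; i < n_vers; i++) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ /* ... build the next older version into heap ... */
+
+ if (heap2) {
+ mem_heap_free(heap2); /* free the heap of the
+ previous round */
+ }
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+}
+#endif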
+
+/*********************************************************************
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view. */
+
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ /* out: TRUE if earlier version should be preserved */
+ dulint trx_id, /* in: transaction id in the version */
+ mtr_t* mtr) /* in: mtr holding the latch on the clustered index
+ record; it will also hold the latch on purge_view */
+{
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ if (trx_purge_update_undo_must_exist(trx_id)) {
+
+ /* A purge operation is not yet allowed to remove this
+ delete marked record */
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any non-delete-marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE. */
+
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+				/* out: TRUE if earlier version should have
+				the secondary index entry ientry */
+ ibool also_curr,/* in: TRUE if also rec is included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ rec_t* rec, /* in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /* in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /* in: the secondary index */
+ dtuple_t* ientry) /* in: the secondary index entry */
+{
+ rec_t* version;
+ rec_t* prev_version;
+ dict_index_t* clust_index;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ dtuple_t* row;
+ dtuple_t* entry;
+ ulint err;
+
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains(mtr, buf_block_align(rec),
+ MTR_MEMO_PAGE_S_FIX));
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (also_curr && !rec_get_deleted_flag(rec)) {
+
+ heap = mem_heap_create(1024);
+ row = row_build(ROW_COPY_POINTERS, clust_index, rec, heap);
+ entry = row_build_index_entry(row, index, heap);
+
+ if (dtuple_datas_are_equal(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+
+ mem_heap_free(heap);
+ }
+
+ version = rec;
+ heap = NULL;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ err = trx_undo_prev_version_build(rec, mtr, version,
+ clust_index, heap,
+ &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* version was stored in heap2,
+ if heap2 != NULL */
+ }
+
+ if ((err != DB_SUCCESS) || !prev_version) {
+ /* Versions end here */
+
+ mem_heap_free(heap);
+
+ return(FALSE);
+ }
+
+ if (!rec_get_deleted_flag(prev_version)) {
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, heap);
+ entry = row_build_index_entry(row, index, heap);
+
+ if (dtuple_datas_are_equal(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = prev_version;
+ }
+}
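+
+/* A hedged sketch of a purge-side caller of the function above; the
+names clust_rec, sec_index, sec_entry and mtr are hypothetical, and the
+latches on clust_rec and purge_view are assumed to be held as the
+function comment requires: */
+#ifdef notdefined
+	if (!row_vers_old_has_index_entry(TRUE, clust_rec, &mtr,
+						sec_index, sec_entry)) {
+		/* No version >= the purge view has sec_entry as its
+		secondary index entry: the entry may be removed */
+	}
+#endif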
+
+/*********************************************************************
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version. */
+
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+ /* out: DB_SUCCESS or DB_MISSING_HISTORY */
+ rec_t* rec, /* in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+			of this record */
+ mtr_t* mtr, /* in: mtr holding the latch on rec */
+ dict_index_t* index, /* in: the clustered index */
+ read_view_t* view, /* in: the consistent read view */
+ mem_heap_t* in_heap,/* in: memory heap from which the memory for
+ old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers)/* out, own: old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ rec_t* version;
+ rec_t* prev_version;
+ dulint prev_trx_id;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ byte* buf;
+ ulint err;
+
+ ut_ad(index->type & DICT_CLUSTERED);
+ ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains(mtr, buf_block_align(rec),
+ MTR_MEMO_PAGE_S_FIX));
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+ ut_ad(!read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index)));
+
+ rw_lock_s_lock(&(purge_sys->latch));
+ version = rec;
+ heap = NULL;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ heap, &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* version was stored in heap2,
+ if heap2 != NULL */
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ prev_trx_id = row_get_rec_trx_id(prev_version, index);
+
+ if (read_view_sees_trx_id(view, prev_trx_id)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = mem_heap_alloc(in_heap, rec_get_size(
+ prev_version));
+ *old_vers = rec_copy(buf, prev_version);
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+ mem_heap_free(heap);
+ rw_lock_s_unlock(&(purge_sys->latch));
+
+ return(err);
+}
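+
+/* A hedged sketch of the calling pattern for a consistent read, using
+the signature declared above; rec, index, view, in_heap, old_vers and
+mtr are hypothetical caller-side names: */
+#ifdef notdefined
+	if (!read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index))) {
+
+		err = row_vers_build_for_consistent_read(rec, &mtr, index,
+						view, in_heap, &old_vers);
+		if ((err == DB_SUCCESS) && (old_vers == NULL)) {
+			/* The record was inserted after the read view was
+			opened: the row does not exist for this reader */
+		}
+	}
+#endif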
diff --git a/innobase/row/ts/makefile b/innobase/row/ts/makefile
new file mode 100644
index 00000000000..589db50d4ed
--- /dev/null
+++ b/innobase/row/ts/makefile
@@ -0,0 +1,16 @@
+
+
+
+include ..\..\makefile.i
+
+tstcur: ..\tcur.lib tstcur.c
+ $(CCOM) $(CFL) -I.. -I..\.. ..\tcur.lib ..\..\trx.lib ..\..\btr.lib ..\..\fut.lib ..\..\fsp.lib ..\..\page.lib ..\..\dyn.lib ..\..\mtr.lib ..\..\log.lib ..\..\rem.lib ..\..\fil.lib ..\..\buf.lib ..\..\dict.lib ..\..\data.lib ..\..\mach.lib ..\..\ha.lib ..\..\ut.lib ..\..\sync.lib ..\..\mem.lib ..\..\os.lib tstcur.c $(LFL)
+
+
+
+
+
+
+
+
+
diff --git a/innobase/row/ts/tstcur.c b/innobase/row/ts/tstcur.c
new file mode 100644
index 00000000000..f5a5eb1f9f3
--- /dev/null
+++ b/innobase/row/ts/tstcur.c
@@ -0,0 +1,1087 @@
+/************************************************************************
+Test for the index system
+
+(c) 1994-1996 Innobase Oy
+
+Created 2/16/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "sync0sync.h"
+#include "ut0mem.h"
+#include "mem0mem.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "os0file.h"
+#include "fil0fil.h"
+#include "fsp0fsp.h"
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "mtr0mtr.h"
+#include "log0log.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "trx0trx.h"
+#include "dict0boot.h"
+#include "trx0sys.h"
+#include "dict0crea.h"
+#include "btr0btr.h"
+#include "btr0pcur.h"
+#include "rem0rec.h"
+#include "..\tcur0ins.h"
+
+os_file_t files[1000];
+
+mutex_t ios_mutex;
+ulint ios;
+ulint n[10];
+
+mutex_t incs_mutex;
+ulint incs;
+
+byte bigbuf[1000000];
+
+#define N_SPACES 1
+#define N_FILES 1
+#define FILE_SIZE 4000 /* must be > 512 */
+#define POOL_SIZE 1000
+#define COUNTER_OFFSET 1500
+
+#define LOOP_SIZE 150
+#define N_THREADS 5
+
+
+ulint zero = 0;
+
+buf_block_t* bl_arr[POOL_SIZE];
+
+/************************************************************************
+Io-handler thread function. */
+
+ulint
+handler_thread(
+/*===========*/
+ void* arg)
+{
+ ulint segment;
+ void* mess;
+ ulint i;
+ bool ret;
+
+ segment = *((ulint*)arg);
+
+ printf("Io handler thread %lu starts\n", segment);
+
+ for (i = 0;; i++) {
+ ret = fil_aio_wait(segment, &mess);
+ ut_a(ret);
+
+ buf_page_io_complete((buf_block_t*)mess);
+
+ mutex_enter(&ios_mutex);
+ ios++;
+ mutex_exit(&ios_mutex);
+
+ }
+
+ return(0);
+}
+
+/*************************************************************************
+Creates the files for the file system test and inserts them into
+the file system. */
+
+void
+create_files(void)
+/*==============*/
+{
+ bool ret;
+ ulint i, k;
+ char name[20];
+ os_thread_t thr[5];
+ os_thread_id_t id[5];
+
+ printf("--------------------------------------------------------\n");
+ printf("Create or open database files\n");
+
+ strcpy(name, "tsfile00");
+
+ for (k = 0; k < N_SPACES; k++) {
+ for (i = 0; i < N_FILES; i++) {
+
+ name[6] = (char)((ulint)'0' + k);
+ name[7] = (char)((ulint)'0' + i);
+
+ files[i] = os_file_create(name, OS_FILE_CREATE,
+ OS_FILE_TABLESPACE, &ret);
+
+ if (ret == FALSE) {
+ ut_a(os_file_get_last_error() ==
+ OS_FILE_ALREADY_EXISTS);
+
+ files[i] = os_file_create(
+ name, OS_FILE_OPEN,
+ OS_FILE_TABLESPACE, &ret);
+
+ ut_a(ret);
+ }
+
+ ret = os_file_close(files[i]);
+ ut_a(ret);
+
+ if (i == 0) {
+ fil_space_create(name, k, OS_FILE_TABLESPACE);
+ }
+
+ ut_a(fil_validate());
+
+ fil_node_create(name, FILE_SIZE, k);
+ }
+ }
+
+ ios = 0;
+
+ mutex_create(&ios_mutex);
+
+ for (i = 0; i < 5; i++) {
+ n[i] = i;
+
+ thr[i] = os_thread_create(handler_thread, n + i, id + i);
+ }
+}
+
+/************************************************************************
+Inits space header of space 0. */
+
+void
+init_space(void)
+/*============*/
+{
+ mtr_t mtr;
+
+ printf("Init space header\n");
+
+ mtr_start(&mtr);
+
+ fsp_header_init(0, FILE_SIZE * N_FILES, &mtr);
+
+ mtr_commit(&mtr);
+}
+
+/*********************************************************************
+Test for index page. */
+
+void
+test1(void)
+/*=======*/
+{
+ dtuple_t* tuple;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ ulint rnd = 0;
+ dict_index_t* index;
+ dict_table_t* table;
+ byte buf[16];
+ ulint i, j;
+ ulint tm, oldtm;
+ trx_t* trx;
+/* dict_tree_t* tree;*/
+ btr_pcur_t pcur;
+ btr_pcur_t pcur2;
+ mtr_t mtr;
+ mtr_t mtr2;
+ byte* field;
+ ulint len;
+ dtuple_t* search_tuple;
+ dict_tree_t* index_tree;
+ rec_t* rec;
+
+ UT_NOT_USED(len);
+ UT_NOT_USED(field);
+ UT_NOT_USED(pcur2);
+/*
+ printf("\n\n\nPress 2 x enter to start test\n");
+
+ while (EOF == getchar()) {
+
+ }
+
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 1. CREATE TABLE WITH 3 COLUMNS AND WITH 3 INDEXES\n");
+
+ heap = mem_heap_create(1024);
+ heap2 = mem_heap_create(1024);
+
+ trx = trx_start(ULINT_UNDEFINED);
+
+ table = dict_mem_table_create("TS_TABLE1", 0, 3);
+
+ dict_mem_table_add_col(table, "COL1", DATA_VARCHAR,
+ DATA_ENGLISH, 10, 0);
+ dict_mem_table_add_col(table, "COL2", DATA_VARCHAR,
+ DATA_ENGLISH, 10, 0);
+ dict_mem_table_add_col(table, "COL3", DATA_VARCHAR,
+ DATA_ENGLISH, 100, 0);
+
+ ut_a(TRUE == dict_create_table(table, trx));
+
+ index = dict_mem_index_create("TS_TABLE1", "IND1", 75046,
+ DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "COL1", 0);
+ dict_mem_index_add_field(index, "COL2", 0);
+
+ ut_a(mem_heap_validate(index->heap));
+
+ ut_a(TRUE == dict_create_index(index, trx));
+
+ trx_commit(trx);
+
+ trx = trx_start(ULINT_UNDEFINED);
+
+ index = dict_mem_index_create("TS_TABLE1", "IND2", 0, DICT_UNIQUE, 1);
+
+ dict_mem_index_add_field(index, "COL2", 0);
+
+ ut_a(mem_heap_validate(index->heap));
+
+ ut_a(TRUE == dict_create_index(index, trx));
+
+ trx_commit(trx);
+
+ trx = trx_start(ULINT_UNDEFINED);
+
+ index = dict_mem_index_create("TS_TABLE1", "IND3", 0, DICT_UNIQUE, 1);
+
+ dict_mem_index_add_field(index, "COL2", 0);
+
+ ut_a(mem_heap_validate(index->heap));
+
+ ut_a(TRUE == dict_create_index(index, trx));
+
+ trx_commit(trx);
+/*
+ tree = dict_index_get_tree(dict_table_get_first_index(table));
+
+ btr_print_tree(tree, 10);
+*/
+ dict_table_print(table);
+
+ /*---------------------------------------------------------*/
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 2. INSERT 1 ROW TO THE TABLE\n");
+
+ trx = trx_start(ULINT_UNDEFINED);
+
+ tuple = dtuple_create(heap, 3);
+
+ table = dict_table_get("TS_TABLE1", trx);
+
+ dtuple_gen_test_tuple3(tuple, 0, buf);
+ tcur_insert(tuple, table, heap2, trx);
+
+ trx_commit(trx);
+/*
+ tree = dict_index_get_tree(dict_table_get_first_index(table));
+
+ btr_print_tree(tree, 10);
+*/
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 3. INSERT MANY ROWS TO THE TABLE IN A SINGLE TRX\n");
+
+ rnd = 0;
+ oldtm = ut_clock();
+
+ trx = trx_start(ULINT_UNDEFINED);
+ for (i = 0; i < 300 * UNIV_DBC * UNIV_DBC; i++) {
+
+ if (i % 5000 == 0) {
+ /* dict_table_print(table);
+ buf_print();
+ buf_LRU_print();
+ printf("%lu rows inserted\n", i); */
+ }
+
+ table = dict_table_get("TS_TABLE1", trx);
+
+ if (i == 2180) {
+ rnd = rnd % 200000;
+ }
+
+ rnd = (rnd + 1) % 200000;
+
+ dtuple_gen_test_tuple3(tuple, rnd, buf);
+
+ tcur_insert(tuple, table, heap2, trx);
+
+ mem_heap_empty(heap2);
+
+ if (i % 4 == 3) {
+ }
+ }
+ trx_commit(trx);
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+ printf("%lu rows inserted\n", i);
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 4. PRINT PART OF CONTENTS OF EACH INDEX TREE\n");
+
+/*
+ mem_print_info();
+*/
+
+/*
+ tree = dict_index_get_tree(dict_table_get_first_index(table));
+
+ btr_print_tree(tree, 10);
+
+ tree = dict_index_get_tree(dict_table_get_next_index(
+ dict_table_get_first_index(table)));
+
+ btr_print_tree(tree, 5);
+*/
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+/* mem_print_info(); */
+
+ os_thread_sleep(5000000);
+
+ for (j = 0; j < 5; j++) {
+ printf("-------------------------------------------------\n");
+ printf("TEST 5. CALCULATE THE JOIN OF THE TABLE WITH ITSELF\n");
+
+ i = 0;
+
+ oldtm = ut_clock();
+
+ mtr_start(&mtr);
+
+ index_tree = dict_index_get_tree(UT_LIST_GET_FIRST(table->indexes));
+
+ search_tuple = dtuple_create(heap, 2);
+
+ dtuple_gen_search_tuple3(search_tuple, i, buf);
+
+ btr_pcur_open(index_tree, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ ut_a(btr_pcur_move_to_next(&pcur, &mtr));
+
+ while (!btr_pcur_is_after_last_in_tree(&pcur, &mtr)) {
+
+ if (i % 20000 == 0) {
+ printf("%lu rows joined\n", i);
+ }
+
+ index_tree = dict_index_get_tree(
+ UT_LIST_GET_FIRST(table->indexes));
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ rec_copy_prefix_to_dtuple(search_tuple, rec, 2, heap2);
+
+ mtr_start(&mtr2);
+
+ btr_pcur_open(index_tree, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur2, &mtr2);
+
+ btr_pcur_move_to_next(&pcur2, &mtr2);
+
+ rec = btr_pcur_get_rec(&pcur2);
+
+ field = rec_get_nth_field(rec, 1, &len);
+
+ ut_a(len == 8);
+
+ ut_a(ut_memcmp(field, dfield_get_data(
+ dtuple_get_nth_field(search_tuple, 1)),
+ len) == 0);
+
+			btr_pcur_close(&pcur2, &mtr2);
+
+ mem_heap_empty(heap2);
+
+ mtr_commit(&mtr2);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ btr_pcur_move_to_next(&pcur, &mtr);
+ i++;
+ }
+
+ btr_pcur_close(&pcur, &mtr);
+ mtr_commit(&mtr);
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+ printf("%lu rows joined\n", i);
+ }
+
+ oldtm = ut_clock();
+
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 6. INSERT MANY ROWS TO THE TABLE IN SEPARATE TRXS\n");
+
+ rnd = 200000;
+
+ for (i = 0; i < 350; i++) {
+
+ if (i % 4 == 0) {
+ }
+ trx = trx_start(ULINT_UNDEFINED);
+
+ table = dict_table_get("TS_TABLE1", trx);
+
+ if (i == 2180) {
+ rnd = rnd % 200000;
+ }
+
+ rnd = (rnd + 1) % 200000;
+
+ dtuple_gen_test_tuple3(tuple, rnd, buf);
+
+ tcur_insert(tuple, table, heap2, trx);
+
+ trx_commit(trx);
+
+ mem_heap_empty(heap2);
+ if (i % 4 == 3) {
+ }
+ }
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+ printf("%lu rows inserted in %lu transactions\n", i, i);
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 7. PRINT MEMORY ALLOCATION INFO\n");
+
+ mem_print_info();
+/*
+ printf("\n\n\nPress 2 x enter to continue test\n");
+
+ while (EOF == getchar()) {
+
+ }
+ getchar();
+*/
+ printf("-------------------------------------------------\n");
+ printf("TEST 8. PRINT SEMAPHORE INFO\n");
+
+ sync_print();
+
+
+
+#ifdef notdefined
+ rnd = 90000;
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) {
+
+ mtr_start(&mtr);
+
+ if (i == 50000) {
+ rnd = rnd % 200000;
+ }
+
+ rnd = (rnd + 595659561) % 200000;
+
+ dtuple_gen_test_tuple3(tuple, rnd, buf);
+
+ btr_pcur_open(tree, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &cursor, &mtr);
+
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+
+ rnd = 0;
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) {
+
+ mtr_start(&mtr);
+
+ rnd = (rnd + 35608971) % 200000 + 1;
+
+ dtuple_gen_test_tuple3(tuple, rnd, buf);
+
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+
+/* btr_print_tree(tree, 3); */
+
+#endif
+ mem_heap_free(heap);
+}
+
+
+#ifdef notdefined
+
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd + 534671) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+/* page_print_list(page, 151); */
+
+ ut_a(page_validate(page, index));
+ ut_a(page_get_n_recs(page) == 512);
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd + 7771) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ page_cur_delete_rec(&cursor, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+ ut_a(page_get_n_recs(page) == 0);
+
+ ut_a(page_validate(page, index));
+ page = page_create(frame, &mtr);
+
+ rnd = 311;
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd + 1) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+ ut_a(page_validate(page, index));
+ ut_a(page_get_n_recs(page) == 512);
+
+ rnd = 217;
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd + 1) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ page_cur_delete_rec(&cursor, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+ ut_a(page_validate(page, index));
+ ut_a(page_get_n_recs(page) == 0);
+ page = page_create(frame, &mtr);
+
+ rnd = 291;
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd - 1) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+ ut_a(page_validate(page, index));
+ ut_a(page_get_n_recs(page) == 512);
+
+ rnd = 277;
+
+ for (i = 0; i < 512; i++) {
+
+ rnd = (rnd - 1) % 512;
+
+ if (i % 27 == 0) {
+ ut_a(page_validate(page, index));
+ }
+
+ dtuple_gen_test_tuple(tuple, rnd);
+
+/* dtuple_print(tuple);*/
+
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ page_cur_delete_rec(&cursor, &mtr);
+
+ ut_a(rec);
+
+ rec_validate(rec);
+/* page_print_list(page, 151); */
+ }
+
+ ut_a(page_validate(page, index));
+ ut_a(page_get_n_recs(page) == 0);
+
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+}
+
+/*********************************************************************
+Test for index page. */
+
+void
+test2(void)
+/*=======*/
+{
+ page_t* page;
+ dtuple_t* tuple;
+ mem_heap_t* heap;
+ ulint i, j;
+ ulint rnd = 0;
+ rec_t* rec;
+ page_cur_t cursor;
+ dict_index_t* index;
+ dict_table_t* table;
+ buf_block_t* block;
+ buf_frame_t* frame;
+ ulint tm, oldtm;
+ byte buf[8];
+ mtr_t mtr;
+
+ printf("-------------------------------------------------\n");
+ printf("TEST 2. Speed test\n");
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) {
+ ut_memcpy(bigbuf, bigbuf + 800, 800);
+ }
+
+ tm = ut_clock();
+	printf("Wall time for %lu mem copies of 800 bytes %lu millisecs\n",
+ i, tm - oldtm);
+
+ oldtm = ut_clock();
+
+ rnd = 0;
+ for (i = 0; i < 1000 * UNIV_DBC * UNIV_DBC; i++) {
+ ut_memcpy(bigbuf + rnd, bigbuf + rnd + 800, 800);
+ rnd += 1600;
+ if (rnd > 995000) {
+ rnd = 0;
+ }
+ }
+
+ tm = ut_clock();
+	printf("Wall time for %lu mem copies of 800 bytes %lu millisecs\n",
+ i, tm - oldtm);
+
+ heap = mem_heap_create(0);
+
+ table = dict_table_create("TS_TABLE2", 2);
+
+ dict_table_add_col(table, "COL1", DATA_VARCHAR, DATA_ENGLISH, 10, 0);
+ dict_table_add_col(table, "COL2", DATA_VARCHAR, DATA_ENGLISH, 10, 0);
+
+ ut_a(0 == dict_table_publish(table));
+
+ index = dict_index_create("TS_TABLE2", "IND2", 0, 2, 0);
+
+ dict_index_add_field(index, "COL1", 0);
+ dict_index_add_field(index, "COL2", 0);
+
+ ut_a(0 == dict_index_publish(index));
+
+ index = dict_index_get("TS_TABLE2", "IND2");
+ ut_a(index);
+
+ tuple = dtuple_create(heap, 2);
+
+ oldtm = ut_clock();
+
+ rnd = 677;
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+ ut_a(rec);
+ }
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf("Wall time for insertion of %lu recs %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(0, 5, &mtr);
+ buf_page_s_lock(block, &mtr);
+
+ page = buf_block_get_frame(block);
+ ut_a(page_validate(page, index));
+ mtr_commit(&mtr);
+
+ oldtm = ut_clock();
+
+ rnd = 677;
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ }
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf(
+ "Wall time for %lu empty loops with page create %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ rnd = 100;
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 1) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+ ut_a(rec);
+ }
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf(
+ "Wall time for sequential insertion of %lu recs %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ rnd = 500;
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd - 1) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+ ut_a(rec);
+ }
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf(
+ "Wall time for descend. seq. insertion of %lu recs %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ rnd = 677;
+
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+ ut_a(rec);
+ }
+
+ rnd = 677;
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ page_cur_delete_rec(&cursor, &mtr);
+ }
+ ut_a(page_get_n_recs(page) == 0);
+
+ mtr_commit(&mtr);
+ }
+
+ tm = ut_clock();
+ printf("Wall time for insert and delete of %lu recs %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+ mtr_start(&mtr);
+
+ block = buf_page_create(0, 5, &mtr);
+ buf_page_x_lock(block, &mtr);
+
+ frame = buf_block_get_frame(block);
+
+ page = page_create(frame, &mtr);
+
+ rnd = 677;
+
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+
+ rec = page_cur_insert_rec(&cursor, tuple, NULL, &mtr);
+ ut_a(rec);
+ }
+ ut_a(page_validate(page, index));
+ mtr_print(&mtr);
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+ rnd = 677;
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ page_cur_search(page, tuple, PAGE_CUR_G, &cursor);
+ }
+ }
+
+ tm = ut_clock();
+ printf("Wall time for search of %lu recs %lu milliseconds\n",
+ i * j, tm - oldtm);
+
+ oldtm = ut_clock();
+
+ for (i = 0; i < 4 * UNIV_DBC * UNIV_DBC; i++) {
+ rnd = 677;
+ for (j = 0; j < 250; j++) {
+ rnd = (rnd + 54841) % 1000;
+ dtuple_gen_test_tuple2(tuple, rnd, buf);
+ }
+ }
+
+ tm = ut_clock();
+ printf("Wall time for %lu empty loops %lu milliseconds\n",
+ i * j, tm - oldtm);
+ mtr_commit(&mtr);
+}
+
+#endif
+
+/********************************************************************
+Main test function. */
+
+void
+main(void)
+/*======*/
+{
+ ulint tm, oldtm;
+ mtr_t mtr;
+
+ sync_init();
+ mem_init();
+ os_aio_init(160, 5);
+ fil_init(25);
+ buf_pool_init(POOL_SIZE, POOL_SIZE);
+ fsp_init();
+ log_init();
+
+ create_files();
+ init_space();
+
+ mtr_start(&mtr);
+
+ trx_sys_create(&mtr);
+ dict_create(&mtr);
+
+ mtr_commit(&mtr);
+
+
+ oldtm = ut_clock();
+
+ ut_rnd_set_seed(19);
+
+ test1();
+
+/* mem_print_info(); */
+
+ tm = ut_clock();
+ printf("Wall time for test %lu milliseconds\n", tm - oldtm);
+ printf("TESTS COMPLETED SUCCESSFULLY!\n");
+}